# Load the dataset (semicolon-separated; "n/a" marks missing values).
# stringsAsFactors = TRUE is requested explicitly: it was the read.csv()
# default before R 4.0.0. With the newer default (character columns) the
# as.numeric() conversion below would turn every text column into NA and
# na.omit() would then discard the affected rows; with factors as.numeric()
# returns the integer level codes instead.
# NOTE(review): the later "2 means YES and 1 means NO" coding of the
# image-size columns presumably comes from these level codes -- confirm.
total_500 <- read.csv(
  "~/GitHub/thesis_msc_business_analytics/total_500.csv",
  sep = ";", na.strings = "n/a", stringsAsFactors = TRUE
)
#total_500 <- read.csv("F:/Dropbox/Dani/Spinellis - Diplwmatiki/Jupyter markdown/total_500.csv", sep=";")
# Number of observations and number of variables
dim(total_500)
## [1] 500 741
#names(total_500)
#str(total_500)
total_500_sub <- total_500
#str(total_500_sub[720:741])
# The four financial variables use a decimal comma; replace it with a decimal
# point so that they can be parsed as numbers (gsub() returns character).
decimal_comma_cols <- c(
  "Assets..", "Market.value..", "Revenues..", "Total.Stockholder.Equity.."
)
for (col in decimal_comma_cols) {
  total_500_sub[[col]] <- gsub(",", ".", total_500_sub[[col]])
}
# Make every variable numeric. The column count is taken from the data
# instead of being hard-coded to 741.
for (i in seq_len(ncol(total_500_sub))) {
  total_500_sub[, i] <- as.numeric(total_500_sub[, i])
}
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
#str(total_500_sub[720:741])
# Keep only the rows without any missing value
total_500_final <- na.omit(total_500_sub)
#we remove the extra value X since it is not necessary for the analysis
#str(total_500_final)
#We convert the Fortune financial variables into two-level (binary) factors so that they are easier to examine
#To do so we first look at their summary and then plot their histogram in order to get a
#good grasp of how they are distributed
#Below we load the libraries that we will use in the analysis
library(ggplot2)
library(reshape2)
library(DAAG)
## Loading required package: lattice
# For each financial variable: print its summary, draw its histogram and
# then replace it by a two-level factor obtained with cut().

# Revenues: bins (-1, 50] and (50, 483]
summary(total_500_final$Revenues..)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.130 7.024 11.020 23.530 20.880 482.100
ggplot(total_500_final, aes(x = Revenues..)) +
  geom_histogram(binwidth = 50, colour = "green", fill = "darkgreen")

total_500_final$Revenues.. <- cut(
  total_500_final$Revenues..,
  breaks = c(-1, 50, 483)
)

# Assets: bins (-1, 500] and (500, 1000]
summary(total_500_final$Assets..)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.077 7.571 17.080 50.470 43.280 877.900
ggplot(total_500_final, aes(x = Assets..)) +
  geom_histogram(binwidth = 100, colour = "red", fill = "darkred")

total_500_final$Assets.. <- cut(
  total_500_final$Assets..,
  breaks = c(-1, 500, 1000)
)

# Market value: bins (-1, 400] and (400, 1000]
summary(total_500_final$Market.value..)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.022 6.924 15.770 53.120 39.450 947.000
ggplot(total_500_final, aes(x = Market.value..)) +
  geom_histogram(binwidth = 100, colour = "blue", fill = "darkblue")

total_500_final$Market.value.. <- cut(
  total_500_final$Market.value..,
  breaks = c(-1, 400, 1000)
)

# Total stockholder equity: bins (-1000, 250] and (250, 1000]
summary(total_500_final$Total.Stockholder.Equity..)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -978.000 2.746 6.427 69.790 20.600 998.000
ggplot(total_500_final, aes(x = Total.Stockholder.Equity..)) +
  geom_histogram(binwidth = 100, colour = "purple", fill = "pink")

total_500_final$Total.Stockholder.Equity.. <- cut(
  total_500_final$Total.Stockholder.Equity..,
  breaks = c(-1000, 250, 1000)
)
#names(total_500_final)
#######################################################################################################
#names(total_500_final)[1]
# Give columns 1-4 and 6 shorter, more readable names
# (column 5 keeps its name; it is dropped below).
colnames(total_500_final)[c(1, 2, 3, 4, 6)] <- c(
  "Ranking", "Assets", "Market_Value", "Revenues", "Total_SH_Equity"
)
# Bin Ranking into (-1, 9] and (9, 500] (the lower break is -1 so that 0 is
# included) and keep the bin number (1 or 2) rather than the factor.
total_500_final$Ranking <- as.numeric(
  cut(total_500_final$Ranking, breaks = c(-1, 9, 500))
)
# Delete the variables we will not need, all in one step. The positions refer
# to the columns as they are before anything is removed:
#    5 = Revenues %
#    7 = company name
#   22 = readability index
#   23 = company url
total_500_final[c(5, 7, 22, 23)] <- NULL
#names(total_500_final)
##########################################################################################################
#Firstly we will analyze the social media relevance with the sites.
#We will see how many of the sites have social media and what type of social media
# Two pie charts per plotting page.
par(mfrow = c(1, 2))

# Share of each level of an indicator column, rounded to 3 decimals.
# The denominator is the number of observations actually present instead of
# a hard-coded 439. dnn = NULL keeps the printed table header empty.
#   flag: vector holding the indicator (0 = no, 1 = yes).
# Returns a one-dimensional table of shares.
social_media_share <- function(flag) {
  round(prop.table(table(flag, dnn = NULL)), 3)
}

# Pie-slice labels built from the computed shares, so that the labels can
# never disagree with the plotted data (the hand-typed percentages used
# previously did not match the tables).
#   share: table returned by social_media_share().
# Returns a character vector such as "38.7 % no", "61.3 % yes".
share_labels <- function(share) {
  answer <- ifelse(names(share) == "1", "% yes", "% no")
  sprintf("%.1f %s", 100 * as.vector(share), answer)
}

#Facebook
social_media_facebook <- social_media_share(total_500_final$facebook)
social_media_facebook
##
## 0 1
## 0.387 0.613
slicelable <- share_labels(social_media_facebook)
pie(social_media_facebook, labels = slicelable,
    main = "Share of companies with Facebook",
    col = rainbow(length(social_media_facebook)))
#Twitter
social_media_twitter <- social_media_share(total_500_final$twitter)
social_media_twitter
##
## 0 1
## 0.337 0.663
slicelable <- share_labels(social_media_twitter)
pie(social_media_twitter, labels = slicelable,
    main = "Share of companies with Twitter",
    col = rainbow(length(social_media_twitter)))

#Instagram
social_media_instagram <- social_media_share(total_500_final$instagram)
social_media_instagram
##
## 0 1
## 0.793 0.207
slicelable <- share_labels(social_media_instagram)
pie(social_media_instagram, labels = slicelable,
    main = "Share of companies with Instagram",
    col = rainbow(length(social_media_instagram)))
#Pinterest
social_media_pinterest <- social_media_share(total_500_final$pinterest)
social_media_pinterest
##
## 0 1
## 0.904 0.096
slicelable <- share_labels(social_media_pinterest)
pie(social_media_pinterest, labels = slicelable,
    main = "Share of companies with Pinterest",
    col = rainbow(length(social_media_pinterest)))

#Youtube
social_media_youtube <- social_media_share(total_500_final$youtube)
social_media_youtube
##
## 0 1
## 0.44 0.56
slicelable <- share_labels(social_media_youtube)
pie(social_media_youtube, labels = slicelable,
    main = "Share of companies with Youtube",
    col = rainbow(length(social_media_youtube)))
#LinkedIn
social_media_linkedin <- social_media_share(total_500_final$linkedin)
social_media_linkedin
##
## 0 1
## 0.453 0.547
slicelable <- share_labels(social_media_linkedin)
pie(social_media_linkedin, labels = slicelable,
    main = "Share of companies with Linkedin",
    col = rainbow(length(social_media_linkedin)))

# Correlations between Ranking (column 1) and the six social-media
# indicators (columns 13 to 18).
social_media_cols <- c(1, 13:18)
total_500_social_media <- total_500_final[, social_media_cols]
par(mfrow = c(1, 1))
library(corrplot)
library(caret)
sm <- cor(total_500_social_media)
sm
## Ranking facebook instagram linkedin pinterest
## Ranking 1.00000000 -0.02734447 -0.03491849 -0.0163476395 -0.0541523033
## facebook -0.02734447 1.00000000 0.37190977 0.5254634954 0.2585697482
## instagram -0.03491849 0.37190977 1.00000000 0.1495741648 0.3877026257
## linkedin -0.01634764 0.52546350 0.14957416 1.0000000000 0.0006024165
## pinterest -0.05415230 0.25856975 0.38770263 0.0006024165 1.0000000000
## twitter 0.02030500 0.68935688 0.32902155 0.5895924203 0.2319603498
## youtube -0.01219198 0.56775499 0.35104712 0.5025537056 0.2256875587
## twitter youtube
## Ranking 0.0203050 -0.01219198
## facebook 0.6893569 0.56775499
## instagram 0.3290216 0.35104712
## linkedin 0.5895924 0.50255371
## pinterest 0.2319603 0.22568756
## twitter 1.0000000 0.56243719
## youtube 0.5624372 1.00000000
# Draw the same correlation matrix with the coefficients printed as numbers.
corrplot(sm, method = "number")

#The highest correlation is between facebook and twitter (69%),
#while the second highest is between twitter and linkedin (59%)
#########################################################################################################
# Histograms of the link counts (total, external, internal), one plot each,
# to see how frequently the different numbers of links occur.
par(mfrow = c(1, 1))
library(ggplot2)
ggplot(total_500_final, aes(x = total.links)) +
  geom_histogram(binwidth = 50, colour = "darkblue", fill = "blue")

ggplot(total_500_final, aes(x = external)) +
  geom_histogram(binwidth = 50, colour = "darkred", fill = "red")

ggplot(total_500_final, aes(x = internal)) +
  geom_histogram(binwidth = 50, colour = "darkgreen", fill = "green")

# Correlations between Ranking (column 1) and the three link counts
# (columns 10 to 12).
link_cols <- c(1, 10:12)
total_500_links <- total_500_final[, link_cols]
library(corrplot)
library(caret)
tl <- cor(total_500_links)
tl
## Ranking external internal total.links
## Ranking 1.00000000 -0.04728683 -0.07335184 -0.08149767
## external -0.04728683 1.00000000 0.07474887 0.32145362
## internal -0.07335184 0.07474887 1.00000000 0.96830449
## total.links -0.08149767 0.32145362 0.96830449 1.00000000
corrplot(tl, method = "number")

#########################################################################################################
# Distribution of the loading time per site
ggplot(total_500_final, aes(x = loading.time)) +
  geom_histogram(binwidth = 1, colour = "pink", fill = "purple")

#########################################################################################################
# Distribution of the total and of the unique word counts.
# NOTE(review): fill = Readability is mapped in aes(), but the constant
# fill = "red" given to geom_histogram() takes precedence, so the bars are
# not coloured by Readability -- confirm whether that was intended.
ggplot(total_500_final, aes(x = total_words, fill = Readability)) +
  geom_histogram(binwidth = 50, colour = "darkred", fill = "red")

ggplot(total_500_final, aes(x = unique_words, fill = Readability)) +
  geom_histogram(binwidth = 50, colour = "darkred", fill = "red")

#########################################################################################################
# Distribution of the total number of images per site
ggplot(total_500_final, aes(x = total.images)) +
  geom_histogram(binwidth = 100, colour = "darkred", fill = "red")

#########################################################################################################
# Frequency of the image types in use: one bar plot per image-type column
# (columns 727 to 735), four plots per page.
par(mfrow = c(2, 2))
k <- 727:735
for (a in k) {
  # Share of sites per value of the column; the denominator is the number of
  # observations in the data instead of a hard-coded 439.
  image_type <- round(table(total_500_final[, a]) / nrow(total_500_final), 3)
  barplot(image_type,
          xlab = names(total_500_final)[a],
          ylab = "Shares of images per site",
          col = "dark green")
}


# It is obvious that the most common image types are .jpg, .png and .gif,
# so they are the ones that we keep.
# Delete the variables we will not need, in one step. The positions refer to
# the columns as they are before anything is removed:
#   727 = .bmp, 728 = .dib, 730 = .jpe, 731 = .jpeg, 734 = .tif, 735 = .tiff
total_500_final[c(727, 728, 730, 731, 734, 735)] <- NULL
#names(total_500_final)
par(mfrow = c(1, 1))

##########################################################################################################
# Check the sizes of the images used: one bar plot per image-size column
# (columns 22 to 726), four plots per page.
# 2 means YES and 1 means NO
par(mfrow = c(2, 2))
ks <- 22:726
for (a in ks) {
  # Share of sites per value of the column; the denominator is the number of
  # observations in the data instead of a hard-coded 439.
  image_type <- round(table(total_500_final[, a]) / nrow(total_500_final), 3)
  barplot(image_type,
          xlab = names(total_500_final)[a],
          ylab = "Shares of images per site",
          col = "dark red")
}
















































































































































































# First keep only the image sizes that occur in more than half of the sites
# under examination, i.e. everything up to size x210x420 [304]; columns
# 305 to 726 are removed.
#names(total_500_final)
rare_size_cols <- 305:726
total_500_final <- total_500_final[, -rare_size_cols]
#names(total_500_final)
# Also remove the sizes that are not clear:
#x1x1 [24], x11x8 [42], x [44], x1x700 [50], X1x10 [53], "X10x1" [54],"X1x660" [56],"X19x1"[57], 2x2 [60], x0x0 [74],"X1x110"[208], "autox100." [249],"autox200" [250], "X2x213" [255],
unclear_size_cols <- c(24, 42, 44, 50, 53, 54, 56, 57, 60, 74, 208, 249, 250, 255)
total_500_final <- total_500_final[, -unclear_size_cols]
#names(total_500_final)
#We still have too many variables to build a regression model
#So we will group the sizes into the following 5 categories so as to have more scalable information
#If at least one of the dimensions belongs to a category, we choose the highest category that a dimension belongs to
#Very large size: more than 800pixels
#Large: 500 - 799 pixels
#Medium: 300 - 499 pixels
#Small: 100 - 299 pixels
#Thumbnail: less than 100 pixels
############################################################################################################
# Collapse the image-size indicator columns of one size category into a
# single indicator. The source columns are coded 1 (no image of that size)
# and 2 (at least one image of that size); the result is 2 when any column
# of the category is 2 and 1 otherwise.
#
#   data: data frame that holds the indicator columns.
#   cols: integer positions of the columns belonging to the category.
# Returns a numeric vector (values 1 or 2) with one entry per row of `data`.
#
# This replaces the per-category copy-pasted loops, which hard-coded the
# number of rows (439) and the length of each column list, reused `i` for
# both nested loops, only had the `else` branch for the "very large"
# category, and overwrote the link correlation matrix `tl` with a loop
# variable. For 1/2-coded columns the result is unchanged.
size_category_flag <- function(data, cols) {
  # The mean of the 1/2 codes exceeds 1 exactly when some column equals 2.
  score <- unname(rowSums(data[, cols, drop = FALSE])) / length(cols)
  ifelse(score > 1, 2, 1)
}

verylarge <- c(24,34,53,54,61,62,69,70,74,75,76,80,81,83,97,98,99,114,122,123,131,138,140,141,145,153,157,182,209,210,212,213,214,215,216,217,219,220,221,222,228,234,258,274,277,280,285,286,287)
# 2 = they have images of this size, 1 = they do not
total_500_final$im_s_verylarge <- size_category_flag(total_500_final, verylarge)
par(mfrow = c(1, 1))

barplot(table(total_500_final$im_s_verylarge), col = "darkred")

###########################################################################################################
large <- c(23,45,52,65,100,113,118,119,129,137,139,150,159,211,218,232,235,249,265)
total_500_final$im_s_large <- size_category_flag(total_500_final, large)
par(mfrow = c(1, 1))
barplot(table(total_500_final$im_s_large), col = "darkblue")

###########################################################################################################
medium <- c(37,67,68,71,85,87,94,96,121,124,154,180,183,198,207,227,238,269,281,288,290)
total_500_final$im_s_medium <- size_category_flag(total_500_final, medium)
par(mfrow = c(1, 1))
barplot(table(total_500_final$im_s_medium), col = "darkgreen")

###########################################################################################################
# NOTE(review): 236 is listed twice below; this does not affect the
# any-column-equals-2 result, but the list should probably be checked.
small <- c(25,26,30,31,32,33,38,39,40,43,44,46,47,51,55,57,60,63,72,73,78,79,88,89,90,95,101,102,103,106,107,109,112,117,120,128,135,143,146,147,148,149,151,152,156,160,161,162,163,164,165,166,167,168,170,171,174,189,190,192,194,196,197,199,201,203,204,206,208,225,226,230,231,233,236,236,237,242,244,246,247,248,252,257,263,264,266,268,271,272,273,275,278,279,282,283,284,289)
total_500_final$im_s_small <- size_category_flag(total_500_final, small)
par(mfrow = c(1, 1))
barplot(table(total_500_final$im_s_small), col = "red")

###########################################################################################################
thumbnail<- c(22,41,64,84,91,92,93,108,126,127,169,224,240,245,250,27,142,28,144,29,155,35,158,36,172,42,173,175,48,176,49,177,50,178,56,179,58,181,184,59,185,66,186,187,188,77,82,191,86,193,195,104,200,105,202,110,111,205,115,116,223,229,125,239,241,130,132,243,133,134,136,251,253,254,255,256,259,260,261,262,267,270,276)
total_500_final$im_s_thumbnail <- size_category_flag(total_500_final, thumbnail)
par(mfrow = c(1, 1))
barplot(table(total_500_final$im_s_thumbnail), col = "blue")

###########################################################################################################
# Remove the individual size variables (columns 22 to 290) and keep only the
# category indicators created from them.
size_cols <- 22:290
total_500_final <- total_500_final[, -size_cols]
#str(total_500_final)
# Turn the four binned financial variables into numbers.
for (fin_col in c("Market_Value", "Assets", "Revenues", "Total_SH_Equity")) {
  total_500_final[[fin_col]] <- as.numeric(total_500_final[[fin_col]])
}
# We fit regression models to see which website variables matter most for
# the Ranking of the company.
# Start with the empty (intercept-only) linear model.
model_null <- lm(Ranking ~ 1, data = total_500_final)
summary(model_null)
##
## Call:
## lm(formula = Ranking ~ 1, data = total_500_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.97722 0.02278 0.02278 0.02278 0.02278
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.977221 0.007129 277.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1494 on 438 degrees of freedom
# Full model: Ranking regressed on every other column, to check which
# variables influence the ranking.
full_model <- lm(Ranking ~ ., data = total_500_final)
anova(full_model)
## Analysis of Variance Table
##
## Response: Ranking
## Df Sum Sq Mean Sq F value Pr(>F)
## Assets 1 0.1588 0.1588 75.4020 < 2.2e-16 ***
## Market_Value 1 0.0484 0.0484 22.9541 2.323e-06 ***
## Revenues 1 1.8686 1.8686 886.9893 < 2.2e-16 ***
## Total_SH_Equity 1 0.0040 0.0040 1.9128 0.1674056
## non.document.error 1 0.0297 0.0297 14.0846 0.0002000 ***
## number_of_errors 1 0.1429 0.1429 67.8482 2.395e-15 ***
## number_of_warning 1 0.0000 0.0000 0.0042 0.9483236
## external 1 0.0001 0.0001 0.0281 0.8669507
## internal 1 0.0295 0.0295 13.9832 0.0002107 ***
## facebook 1 0.0035 0.0035 1.6757 0.1962250
## instagram 1 0.0002 0.0002 0.0917 0.7622313
## linkedin 1 0.0171 0.0171 8.1325 0.0045678 **
## pinterest 1 0.0033 0.0033 1.5760 0.2100527
## twitter 1 0.0307 0.0307 14.5502 0.0001575 ***
## youtube 1 0.0132 0.0132 6.2670 0.0126883 *
## Readability 1 0.0453 0.0453 21.5025 4.758e-06 ***
## total_words 1 0.0271 0.0271 12.8529 0.0003776 ***
## unique_words 1 0.0355 0.0355 16.8322 4.928e-05 ***
## .gif 1 0.0030 0.0030 1.4141 0.2350671
## .jpg 1 0.0048 0.0048 2.2614 0.1334042
## .png 1 0.0003 0.0003 0.1294 0.7192572
## total.images 1 0.0493 0.0493 23.4204 1.847e-06 ***
## loading.time 1 0.0260 0.0260 12.3445 0.0004915 ***
## im_s_verylarge 1 3.8436 3.8436 1824.4872 < 2.2e-16 ***
## im_s_large 1 0.0004 0.0004 0.1685 0.6816768
## im_s_medium 1 0.4187 0.4187 198.7642 < 2.2e-16 ***
## im_s_small 1 2.1047 2.1047 999.0680 < 2.2e-16 ***
## im_s_thumbnail 1 0.0000 0.0000 0.0039 0.9500487
## Residuals 410 0.8637 0.0021
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#####################################################################################################
#Use of LASSO
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-2
# First approach for the lasso: refit the full model of Ranking on all other
# columns; its model matrix provides the predictors.
full <- lm(Ranking ~ ., data = total_500_final)
anova(full)
## Analysis of Variance Table
##
## Response: Ranking
## Df Sum Sq Mean Sq F value Pr(>F)
## Assets 1 0.1588 0.1588 75.4020 < 2.2e-16 ***
## Market_Value 1 0.0484 0.0484 22.9541 2.323e-06 ***
## Revenues 1 1.8686 1.8686 886.9893 < 2.2e-16 ***
## Total_SH_Equity 1 0.0040 0.0040 1.9128 0.1674056
## non.document.error 1 0.0297 0.0297 14.0846 0.0002000 ***
## number_of_errors 1 0.1429 0.1429 67.8482 2.395e-15 ***
## number_of_warning 1 0.0000 0.0000 0.0042 0.9483236
## external 1 0.0001 0.0001 0.0281 0.8669507
## internal 1 0.0295 0.0295 13.9832 0.0002107 ***
## facebook 1 0.0035 0.0035 1.6757 0.1962250
## instagram 1 0.0002 0.0002 0.0917 0.7622313
## linkedin 1 0.0171 0.0171 8.1325 0.0045678 **
## pinterest 1 0.0033 0.0033 1.5760 0.2100527
## twitter 1 0.0307 0.0307 14.5502 0.0001575 ***
## youtube 1 0.0132 0.0132 6.2670 0.0126883 *
## Readability 1 0.0453 0.0453 21.5025 4.758e-06 ***
## total_words 1 0.0271 0.0271 12.8529 0.0003776 ***
## unique_words 1 0.0355 0.0355 16.8322 4.928e-05 ***
## .gif 1 0.0030 0.0030 1.4141 0.2350671
## .jpg 1 0.0048 0.0048 2.2614 0.1334042
## .png 1 0.0003 0.0003 0.1294 0.7192572
## total.images 1 0.0493 0.0493 23.4204 1.847e-06 ***
## loading.time 1 0.0260 0.0260 12.3445 0.0004915 ***
## im_s_verylarge 1 3.8436 3.8436 1824.4872 < 2.2e-16 ***
## im_s_large 1 0.0004 0.0004 0.1685 0.6816768
## im_s_medium 1 0.4187 0.4187 198.7642 < 2.2e-16 ***
## im_s_small 1 2.1047 2.1047 999.0680 < 2.2e-16 ***
## im_s_thumbnail 1 0.0000 0.0000 0.0039 0.9500487
## Residuals 410 0.8637 0.0021
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Predictor matrix of the full model without the intercept column.
x <- model.matrix(full)[, -1]
dim(x)
## [1] 439 30
# Lasso path for Ranking, plotted with variable labels (default x-axis first,
# then against lambda).
lasso <- glmnet(x, total_500_final$Ranking)
par(mfrow = c(1, 1), no.readonly = TRUE)
plot(lasso, label = TRUE)

plot(lasso, xvar = "lambda", label = TRUE)

# Cross-validated lasso.
# NOTE(review): no seed is set before cv.glmnet(), so the fold assignment,
# and with it lambda.min / lambda.1se, may differ between runs -- confirm
# whether a set.seed() call is wanted here.
lassob <- cv.glmnet(x, total_500_final$Ranking)
lassob$lambda.min
## [1] 0.007201957
lassob$lambda.1se
## [1] 0.0462947
plot(lassob)

# Coefficients at lambda.min
blasso <- coef(lassob, s = "lambda.min")
blasso
## 31 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 0.10129508
## Assets .
## Market_Value .
## Revenues .
## Total_SH_Equity .
## The_page_opened .
## non.document.error .
## number_of_errors .
## number_of_warning .
## external .
## internal .
## total.links .
## facebook .
## instagram .
## linkedin .
## pinterest .
## twitter .
## youtube .
## Readability .
## total_words .
## unique_words .
## .gif .
## .jpg .
## .png .
## total.images .
## loading.time .
## im_s_verylarge .
## im_s_large .
## im_s_medium 0.04685267
## im_s_small 0.90163323
## im_s_thumbnail .
dim(blasso)
## [1] 31 1
# Scale the lasso and the least-squares coefficients (intercept dropped) by
# the standard deviation of the corresponding predictor column.
x_sd <- apply(x, 2, sd)
zblasso <- blasso[-1] * x_sd
zbolt <- coef(full)[-1] * x_sd
azbolt <- abs(zbolt)
sum(azbolt)
## [1] NA
# The sum is NA, which means that some variables have to be removed.
# To find out which ones, print the coefficients and look for those that
# are NA.
coef(full)
## (Intercept) Assets Market_Value
## -1.569406e-02 -3.204483e-03 5.082979e-03
## Revenues Total_SH_Equity The_page_opened
## 3.734890e-03 2.654699e-04 NA
## non.document.error number_of_errors number_of_warning
## -3.882069e-04 1.433081e-05 2.054670e-04
## external internal total.links
## -2.442644e-04 -8.831323e-07 NA
## facebook instagram linkedin
## 4.401800e-03 -8.568218e-03 -1.006576e-02
## pinterest twitter youtube
## -2.055757e-02 5.428349e-03 4.144158e-03
## Readability total_words unique_words
## 1.139261e-03 -3.881897e-05 6.132327e-05
## .gif .jpg .png
## 1.563168e-04 1.118662e-04 -4.115544e-05
## total.images loading.time im_s_verylarge
## 4.186452e-05 4.720366e-03 -7.738989e-03
## im_s_large im_s_medium im_s_small
## 2.097589e-03 7.177551e-02 9.378488e-01
## im_s_thumbnail
## 2.873606e-03
# Refit using only the variables whose coefficient is not NA: columns 6 and
# 12 of the data (The_page_opened and total.links in the coefficient listing)
# are left out.
na_coef_cols <- c(6, 12)
total_500_final_r <- total_500_final[, -na_coef_cols]
full_2 <- lm(Ranking ~ ., data = total_500_final_r)
anova(full_2)
## Analysis of Variance Table
##
## Response: Ranking
## Df Sum Sq Mean Sq F value Pr(>F)
## Assets 1 0.1588 0.1588 75.4020 < 2.2e-16 ***
## Market_Value 1 0.0484 0.0484 22.9541 2.323e-06 ***
## Revenues 1 1.8686 1.8686 886.9893 < 2.2e-16 ***
## Total_SH_Equity 1 0.0040 0.0040 1.9128 0.1674056
## non.document.error 1 0.0297 0.0297 14.0846 0.0002000 ***
## number_of_errors 1 0.1429 0.1429 67.8482 2.395e-15 ***
## number_of_warning 1 0.0000 0.0000 0.0042 0.9483236
## external 1 0.0001 0.0001 0.0281 0.8669507
## internal 1 0.0295 0.0295 13.9832 0.0002107 ***
## facebook 1 0.0035 0.0035 1.6757 0.1962250
## instagram 1 0.0002 0.0002 0.0917 0.7622313
## linkedin 1 0.0171 0.0171 8.1325 0.0045678 **
## pinterest 1 0.0033 0.0033 1.5760 0.2100527
## twitter 1 0.0307 0.0307 14.5502 0.0001575 ***
## youtube 1 0.0132 0.0132 6.2670 0.0126883 *
## Readability 1 0.0453 0.0453 21.5025 4.758e-06 ***
## total_words 1 0.0271 0.0271 12.8529 0.0003776 ***
## unique_words 1 0.0355 0.0355 16.8322 4.928e-05 ***
## .gif 1 0.0030 0.0030 1.4141 0.2350671
## .jpg 1 0.0048 0.0048 2.2614 0.1334042
## .png 1 0.0003 0.0003 0.1294 0.7192572
## total.images 1 0.0493 0.0493 23.4204 1.847e-06 ***
## loading.time 1 0.0260 0.0260 12.3445 0.0004915 ***
## im_s_verylarge 1 3.8436 3.8436 1824.4872 < 2.2e-16 ***
## im_s_large 1 0.0004 0.0004 0.1685 0.6816768
## im_s_medium 1 0.4187 0.4187 198.7642 < 2.2e-16 ***
## im_s_small 1 2.1047 2.1047 999.0680 < 2.2e-16 ***
## im_s_thumbnail 1 0.0000 0.0000 0.0039 0.9500487
## Residuals 410 0.8637 0.0021
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Predictor matrix of the reduced model without the intercept column.
x <- model.matrix(full_2)[, -1]
dim(x)
## [1] 439 28
# Lasso path for Ranking on the reduced predictor set, plotted with variable
# labels (default x-axis first, then against lambda).
lasso <- glmnet(x, total_500_final_r$Ranking)
par(mfrow = c(1, 1), no.readonly = TRUE)
plot(lasso, label = TRUE)

plot(lasso, xvar = "lambda", label = TRUE)

# Cross-validated lasso.
# NOTE(review): no seed is set before cv.glmnet(), so the fold assignment,
# and with it lambda.min / lambda.1se, may differ between runs -- confirm
# whether a set.seed() call is wanted here.
lassob <- cv.glmnet(x, total_500_final_r$Ranking)
lassob$lambda.min
## [1] 0.007201957
lassob$lambda.1se
## [1] 0.04218201
plot(lassob)

# Coefficients at lambda.min
blasso <- coef(lassob, s = "lambda.min")
blasso
## 29 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 0.10129508
## Assets .
## Market_Value .
## Revenues .
## Total_SH_Equity .
## non.document.error .
## number_of_errors .
## number_of_warning .
## external .
## internal .
## facebook .
## instagram .
## linkedin .
## pinterest .
## twitter .
## youtube .
## Readability .
## total_words .
## unique_words .
## .gif .
## .jpg .
## .png .
## total.images .
## loading.time .
## im_s_verylarge .
## im_s_large .
## im_s_medium 0.04685267
## im_s_small 0.90163323
## im_s_thumbnail .
dim(blasso)
## [1] 29 1
# Scale the lasso and the least-squares coefficients (intercept dropped) by
# the standard deviation of the corresponding predictor column.
x_sd <- apply(x, 2, sd)
zblasso <- blasso[-1] * x_sd
zbolt <- coef(full_2)[-1] * x_sd
azbolt <- abs(zbolt)
sum(azbolt)
## [1] 0.2218838
# s: sum of absolute scaled lasso coefficients relative to the sum of
# absolute scaled least-squares coefficients (at lambda.min).
s <- sum(abs(zblasso)) / sum(abs(azbolt))
s
## [1] 0.6245467
# Coefficients at lambda.1se
blassob <- coef(lassob, s = "lambda.1se")
blassob
## 29 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 0.5915647
## Assets .
## Market_Value .
## Revenues .
## Total_SH_Equity .
## non.document.error .
## number_of_errors .
## number_of_warning .
## external .
## internal .
## facebook .
## instagram .
## linkedin .
## pinterest .
## twitter .
## youtube .
## Readability .
## total_words .
## unique_words .
## .gif .
## .jpg .
## .png .
## total.images .
## loading.time .
## im_s_verylarge .
## im_s_large .
## im_s_medium .
## im_s_small 0.7000036
## im_s_thumbnail .
# Same ratio as above, now for the coefficients at lambda.1se.
col_sd <- apply(x, 2, sd)
zblassob <- blassob[-1] * col_sd
zboltb <- coef(full_2)[-1] * col_sd
s <- sum(abs(zblassob)) / sum(abs(zboltb))
s
## [1] 0.4475699
# Forward selection from the null model towards the reduced full model, to
# see how many variables are indeed important.
forward_scope <- list(lower = model_null, upper = full_2)
model_a <- step(model_null, scope = forward_scope, direction = "forward")
## Start: AIC=-1668.38
## Ranking ~ 1
##
## Df Sum of Sq RSS AIC
## + im_s_small 1 8.7745 0.9977 -2668.1
## + im_s_verylarge 1 5.8092 3.9630 -2062.6
## + im_s_large 1 4.8298 4.9424 -1965.6
## + im_s_medium 1 3.9389 5.8333 -1892.9
## + im_s_thumbnail 1 2.8846 6.8876 -1820.0
## + Revenues 1 2.0449 7.7273 -1769.5
## + number_of_errors 1 0.2891 9.4831 -1679.6
## + Assets 1 0.1588 9.6134 -1673.6
## + loading.time 1 0.1239 9.6484 -1672.0
## + total_words 1 0.1029 9.6693 -1671.0
## + non.document.error 1 0.0847 9.6875 -1670.2
## + number_of_warning 1 0.0632 9.7090 -1669.2
## + internal 1 0.0526 9.7196 -1668.7
## + Readability 1 0.0524 9.7198 -1668.7
## + Market_Value 1 0.0452 9.7270 -1668.4
## <none> 9.7722 -1668.4
## + pinterest 1 0.0287 9.7436 -1667.7
## + total.images 1 0.0219 9.7503 -1667.4
## + external 1 0.0219 9.7504 -1667.4
## + instagram 1 0.0119 9.7603 -1666.9
## + .png 1 0.0098 9.7624 -1666.8
## + unique_words 1 0.0089 9.7633 -1666.8
## + .gif 1 0.0083 9.7639 -1666.8
## + facebook 1 0.0073 9.7649 -1666.7
## + .jpg 1 0.0048 9.7674 -1666.6
## + twitter 1 0.0040 9.7682 -1666.6
## + linkedin 1 0.0026 9.7696 -1666.5
## + youtube 1 0.0015 9.7708 -1666.4
## + Total_SH_Equity 1 0.0000 9.7722 -1666.4
##
## Step: AIC=-2668.12
## Ranking ~ im_s_small
##
## Df Sum of Sq RSS AIC
## + im_s_medium 1 0.064341 0.93333 -2695.4
## + Revenues 1 0.026246 0.97143 -2677.8
## + external 1 0.021898 0.97578 -2675.9
## + pinterest 1 0.021548 0.97613 -2675.7
## + instagram 1 0.008718 0.98896 -2670.0
## <none> 0.99767 -2668.1
## + total_words 1 0.003628 0.99405 -2667.7
## + linkedin 1 0.001890 0.99578 -2666.9
## + youtube 1 0.001786 0.99589 -2666.9
## + facebook 1 0.001448 0.99623 -2666.8
## + .png 1 0.001241 0.99643 -2666.7
## + twitter 1 0.001144 0.99653 -2666.6
## + non.document.error 1 0.000890 0.99678 -2666.5
## + internal 1 0.000805 0.99687 -2666.5
## + Total_SH_Equity 1 0.000246 0.99743 -2666.2
## + .gif 1 0.000166 0.99751 -2666.2
## + unique_words 1 0.000156 0.99752 -2666.2
## + loading.time 1 0.000075 0.99760 -2666.2
## + Market_Value 1 0.000056 0.99762 -2666.1
## + .jpg 1 0.000051 0.99762 -2666.1
## + Readability 1 0.000051 0.99762 -2666.1
## + number_of_warning 1 0.000045 0.99763 -2666.1
## + Assets 1 0.000018 0.99766 -2666.1
## + total.images 1 0.000009 0.99767 -2666.1
## + number_of_errors 1 0.000003 0.99767 -2666.1
## + im_s_verylarge 1 0.000000 0.99767 -2666.1
## + im_s_large 1 0.000000 0.99767 -2666.1
## + im_s_thumbnail 1 0.000000 0.99767 -2666.1
##
## Step: AIC=-2695.38
## Ranking ~ im_s_small + im_s_medium
##
## Df Sum of Sq RSS AIC
## + external 1 0.0210641 0.91227 -2703.4
## + pinterest 1 0.0143318 0.91900 -2700.2
## + instagram 1 0.0088724 0.92446 -2697.6
## + total_words 1 0.0064299 0.92690 -2696.4
## <none> 0.93333 -2695.4
## + linkedin 1 0.0041120 0.92922 -2695.3
## + .png 1 0.0021242 0.93121 -2694.4
## + facebook 1 0.0020931 0.93124 -2694.4
## + youtube 1 0.0020139 0.93132 -2694.3
## + number_of_warning 1 0.0018720 0.93146 -2694.3
## + twitter 1 0.0016338 0.93170 -2694.2
## + internal 1 0.0014642 0.93187 -2694.1
## + unique_words 1 0.0014323 0.93190 -2694.1
## + non.document.error 1 0.0008262 0.93251 -2693.8
## + Market_Value 1 0.0003838 0.93295 -2693.6
## + number_of_errors 1 0.0001157 0.93322 -2693.4
## + Total_SH_Equity 1 0.0001146 0.93322 -2693.4
## + total.images 1 0.0000886 0.93324 -2693.4
## + .gif 1 0.0000426 0.93329 -2693.4
## + loading.time 1 0.0000312 0.93330 -2693.4
## + .jpg 1 0.0000262 0.93331 -2693.4
## + Assets 1 0.0000000 0.93333 -2693.4
## + Revenues 1 0.0000000 0.93333 -2693.4
## + Readability 1 0.0000000 0.93333 -2693.4
## + im_s_verylarge 1 0.0000000 0.93333 -2693.4
## + im_s_large 1 0.0000000 0.93333 -2693.4
## + im_s_thumbnail 1 0.0000000 0.93333 -2693.4
##
## Step: AIC=-2703.4
## Ranking ~ im_s_small + im_s_medium + external
##
## Df Sum of Sq RSS AIC
## + pinterest 1 0.0122667 0.90000 -2707.3
## + instagram 1 0.0061065 0.90616 -2704.3
## + number_of_warning 1 0.0055303 0.90674 -2704.1
## <none> 0.91227 -2703.4
## + linkedin 1 0.0037096 0.90856 -2703.2
## + total.images 1 0.0032128 0.90906 -2702.9
## + .jpg 1 0.0031131 0.90916 -2702.9
## + total_words 1 0.0013087 0.91096 -2702.0
## + number_of_errors 1 0.0008097 0.91146 -2701.8
## + facebook 1 0.0008004 0.91147 -2701.8
## + internal 1 0.0007505 0.91152 -2701.8
## + .gif 1 0.0006621 0.91161 -2701.7
## + twitter 1 0.0005186 0.91175 -2701.7
## + youtube 1 0.0005185 0.91175 -2701.7
## + loading.time 1 0.0002621 0.91201 -2701.5
## + Market_Value 1 0.0002563 0.91201 -2701.5
## + unique_words 1 0.0002136 0.91206 -2701.5
## + Revenues 1 0.0001358 0.91213 -2701.5
## + .png 1 0.0000641 0.91221 -2701.4
## + Total_SH_Equity 1 0.0000160 0.91225 -2701.4
## + non.document.error 1 0.0000091 0.91226 -2701.4
## + im_s_thumbnail 1 0.0000078 0.91226 -2701.4
## + Assets 1 0.0000049 0.91226 -2701.4
## + Readability 1 0.0000033 0.91227 -2701.4
## + im_s_verylarge 1 0.0000012 0.91227 -2701.4
## + im_s_large 1 0.0000002 0.91227 -2701.4
##
## Step: AIC=-2707.35
## Ranking ~ im_s_small + im_s_medium + external + pinterest
##
## Df Sum of Sq RSS AIC
## + number_of_warning 1 0.0069133 0.89309 -2708.7
## + total.images 1 0.0064502 0.89355 -2708.5
## <none> 0.90000 -2707.3
## + .jpg 1 0.0040342 0.89597 -2707.3
## + linkedin 1 0.0036012 0.89640 -2707.1
## + .gif 1 0.0035009 0.89650 -2707.1
## + number_of_errors 1 0.0016621 0.89834 -2706.2
## + instagram 1 0.0014683 0.89853 -2706.1
## + Revenues 1 0.0004156 0.89959 -2705.6
## + Market_Value 1 0.0002176 0.89978 -2705.4
## + loading.time 1 0.0001872 0.89982 -2705.4
## + internal 1 0.0001821 0.89982 -2705.4
## + im_s_thumbnail 1 0.0001261 0.89988 -2705.4
## + total_words 1 0.0001013 0.89990 -2705.4
## + im_s_large 1 0.0000249 0.89998 -2705.4
## + non.document.error 1 0.0000185 0.89998 -2705.4
## + im_s_verylarge 1 0.0000103 0.89999 -2705.3
## + twitter 1 0.0000084 0.89999 -2705.3
## + youtube 1 0.0000033 0.90000 -2705.3
## + Assets 1 0.0000030 0.90000 -2705.3
## + Readability 1 0.0000030 0.90000 -2705.3
## + .png 1 0.0000026 0.90000 -2705.3
## + Total_SH_Equity 1 0.0000017 0.90000 -2705.3
## + unique_words 1 0.0000006 0.90000 -2705.3
## + facebook 1 0.0000001 0.90000 -2705.3
##
## Step: AIC=-2708.73
## Ranking ~ im_s_small + im_s_medium + external + pinterest + number_of_warning
##
## Df Sum of Sq RSS AIC
## + total.images 1 0.0070871 0.88600 -2710.2
## + .jpg 1 0.0044739 0.88862 -2708.9
## <none> 0.89309 -2708.7
## + linkedin 1 0.0039921 0.88910 -2708.7
## + .gif 1 0.0032994 0.88979 -2708.4
## + instagram 1 0.0014505 0.89164 -2707.4
## + number_of_errors 1 0.0006708 0.89242 -2707.1
## + Revenues 1 0.0005667 0.89252 -2707.0
## + internal 1 0.0004682 0.89262 -2707.0
## + im_s_thumbnail 1 0.0002492 0.89284 -2706.8
## + loading.time 1 0.0002400 0.89285 -2706.8
## + total_words 1 0.0001767 0.89291 -2706.8
## + im_s_verylarge 1 0.0001569 0.89293 -2706.8
## + non.document.error 1 0.0001452 0.89294 -2706.8
## + im_s_large 1 0.0001347 0.89295 -2706.8
## + Market_Value 1 0.0001242 0.89296 -2706.8
## + twitter 1 0.0000110 0.89308 -2706.7
## + Total_SH_Equity 1 0.0000058 0.89308 -2706.7
## + .png 1 0.0000034 0.89309 -2706.7
## + unique_words 1 0.0000022 0.89309 -2706.7
## + youtube 1 0.0000012 0.89309 -2706.7
## + Assets 1 0.0000010 0.89309 -2706.7
## + Readability 1 0.0000006 0.89309 -2706.7
## + facebook 1 0.0000001 0.89309 -2706.7
##
## Step: AIC=-2710.23
## Ranking ~ im_s_small + im_s_medium + external + pinterest + number_of_warning +
## total.images
##
## Df Sum of Sq RSS AIC
## <none> 0.88600 -2710.2
## + linkedin 1 0.0033296 0.88267 -2709.9
## + .jpg 1 0.0020290 0.88397 -2709.2
## + instagram 1 0.0020129 0.88399 -2709.2
## + total_words 1 0.0019668 0.88404 -2709.2
## + .png 1 0.0019107 0.88409 -2709.2
## + internal 1 0.0010181 0.88498 -2708.7
## + .gif 1 0.0006662 0.88534 -2708.6
## + unique_words 1 0.0002712 0.88573 -2708.4
## + Market_Value 1 0.0001866 0.88582 -2708.3
## + im_s_verylarge 1 0.0001838 0.88582 -2708.3
## + number_of_errors 1 0.0001641 0.88584 -2708.3
## + Revenues 1 0.0001632 0.88584 -2708.3
## + loading.time 1 0.0001548 0.88585 -2708.3
## + im_s_thumbnail 1 0.0001396 0.88586 -2708.3
## + non.document.error 1 0.0001234 0.88588 -2708.3
## + im_s_large 1 0.0001143 0.88589 -2708.3
## + Total_SH_Equity 1 0.0000185 0.88598 -2708.2
## + twitter 1 0.0000072 0.88599 -2708.2
## + Readability 1 0.0000069 0.88600 -2708.2
## + youtube 1 0.0000037 0.88600 -2708.2
## + facebook 1 0.0000011 0.88600 -2708.2
## + Assets 1 0.0000004 0.88600 -2708.2
# Coefficient table and fit statistics of the model returned by the forward
# step() search.
summary(model_a)
##
## Call:
## lm(formula = Ranking ~ im_s_small + im_s_medium + external +
## pinterest + number_of_warning + total.images, data = total_500_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.88600 -0.00365 -0.00251 -0.00035 0.08983
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.131e-03 3.056e-02 -0.168 0.866762
## im_s_small 9.389e-01 1.919e-02 48.937 < 2e-16 ***
## im_s_medium 6.493e-02 1.210e-02 5.366 1.31e-07 ***
## external -2.441e-04 6.394e-05 -3.818 0.000154 ***
## pinterest -2.201e-02 7.586e-03 -2.902 0.003899 **
## number_of_warning 1.954e-04 1.018e-04 1.919 0.055682 .
## total.images 3.595e-05 1.934e-05 1.859 0.063720 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04529 on 432 degrees of freedom
## Multiple R-squared: 0.9093, Adjusted R-squared: 0.9081
## F-statistic: 722.1 on 6 and 432 DF, p-value: < 2.2e-16
ad_r_sq_ma <- summary(model_a)$adj.r.squared
ad_r_sq_ma
## [1] 0.9080753
aic_ma <- AIC(model_a)
aic_ma
## [1] -1462.402
par(mfrow=c(3,1))
plot(model_a,which=1:3)

1
## [1] 1
2
## [1] 2
3
## [1] 3
# We compute the confidence intervals of the model coefficients
# (confint's default 95% level, i.e. the 2.5 % and 97.5 % bounds printed below)
confint(model_a)
## 2.5 % 97.5 %
## (Intercept) -6.520131e-02 0.0549399347
## im_s_small 9.011789e-01 0.9765959295
## im_s_medium 4.114544e-02 0.0887060981
## external -3.698321e-04 -0.0001184734
## pinterest -3.692400e-02 -0.0071035844
## number_of_warning -4.763406e-06 0.0003955086
## total.images -2.061062e-06 0.0000739683
#From this model we can conclude that for the Ranking the variables that play the most important role are
#whether or not they use small images, medium images, the number of external links, whether or not they have pinterest, the number of warnings in the html code and the total number of images
#(note: the number of warnings and the total number of images are only significant at the 10% level, p = 0.056 and p = 0.064)
######################################################################################################
#Next we will try to see which variables play the most important role for the market value of a company
# Intercept-only (null) linear model for Market_Value; it is the starting
# point of the forward selection further down.
MV_model_null <- lm(Market_Value ~ 1, data = total_500_final)
summary(MV_model_null)
##
## Call:
## lm(formula = Market_Value ~ 1, data = total_500_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.02733 -0.02733 -0.02733 -0.02733 0.97267
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.027335 0.007791 131.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1632 on 438 degrees of freedom
# Full model: Market_Value regressed on every other column of
# total_500_final, followed by its sequential analysis-of-variance table.
MV_full_model <- lm(Market_Value ~ ., data = total_500_final)
anova(MV_full_model)
## Analysis of Variance Table
##
## Response: Market_Value
## Df Sum Sq Mean Sq F value Pr(>F)
## Ranking 1 0.0540 0.05403 2.0908 0.14895
## Assets 1 0.0084 0.00844 0.3265 0.56803
## Revenues 1 0.0467 0.04674 1.8086 0.17942
## Total_SH_Equity 1 0.4033 0.40329 15.6054 9.186e-05 ***
## non.document.error 1 0.0357 0.03572 1.3821 0.24042
## number_of_errors 1 0.0240 0.02396 0.9273 0.33612
## number_of_warning 1 0.0280 0.02796 1.0821 0.29884
## external 1 0.0066 0.00659 0.2551 0.61378
## internal 1 0.0262 0.02623 1.0148 0.31434
## facebook 1 0.0545 0.05449 2.1084 0.14725
## instagram 1 0.0012 0.00120 0.0465 0.82936
## linkedin 1 0.0132 0.01316 0.5093 0.47584
## pinterest 1 0.0002 0.00017 0.0066 0.93544
## twitter 1 0.0101 0.01009 0.3903 0.53247
## youtube 1 0.0236 0.02361 0.9138 0.33968
## Readability 1 0.0002 0.00016 0.0062 0.93713
## total_words 1 0.0018 0.00183 0.0710 0.79003
## unique_words 1 0.0035 0.00353 0.1366 0.71185
## .gif 1 0.0268 0.02684 1.0384 0.30879
## .jpg 1 0.0120 0.01198 0.4636 0.49632
## .png 1 0.0195 0.01948 0.7540 0.38573
## total.images 1 0.0001 0.00007 0.0026 0.95903
## loading.time 1 0.0051 0.00512 0.1983 0.65636
## im_s_verylarge 1 0.0783 0.07834 3.0316 0.08241 .
## im_s_large 1 0.0236 0.02365 0.9150 0.33936
## im_s_medium 1 0.0009 0.00091 0.0351 0.85141
## im_s_small 1 0.0001 0.00009 0.0034 0.95357
## im_s_thumbnail 1 0.1688 0.16878 6.5311 0.01096 *
## Residuals 410 10.5955 0.02584
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#####################################################################################################
#Use of LASSO
library(glmnet)
# First approach for Market_Value: fit the full linear model on all columns.
# It supplies both the design matrix for glmnet and the least-squares
# coefficients that the lasso estimates are compared against.
MV_full <- lm(Market_Value ~ ., data = total_500_final)
anova(MV_full)
## Analysis of Variance Table
##
## Response: Market_Value
## Df Sum Sq Mean Sq F value Pr(>F)
## Ranking 1 0.0540 0.05403 2.0908 0.14895
## Assets 1 0.0084 0.00844 0.3265 0.56803
## Revenues 1 0.0467 0.04674 1.8086 0.17942
## Total_SH_Equity 1 0.4033 0.40329 15.6054 9.186e-05 ***
## non.document.error 1 0.0357 0.03572 1.3821 0.24042
## number_of_errors 1 0.0240 0.02396 0.9273 0.33612
## number_of_warning 1 0.0280 0.02796 1.0821 0.29884
## external 1 0.0066 0.00659 0.2551 0.61378
## internal 1 0.0262 0.02623 1.0148 0.31434
## facebook 1 0.0545 0.05449 2.1084 0.14725
## instagram 1 0.0012 0.00120 0.0465 0.82936
## linkedin 1 0.0132 0.01316 0.5093 0.47584
## pinterest 1 0.0002 0.00017 0.0066 0.93544
## twitter 1 0.0101 0.01009 0.3903 0.53247
## youtube 1 0.0236 0.02361 0.9138 0.33968
## Readability 1 0.0002 0.00016 0.0062 0.93713
## total_words 1 0.0018 0.00183 0.0710 0.79003
## unique_words 1 0.0035 0.00353 0.1366 0.71185
## .gif 1 0.0268 0.02684 1.0384 0.30879
## .jpg 1 0.0120 0.01198 0.4636 0.49632
## .png 1 0.0195 0.01948 0.7540 0.38573
## total.images 1 0.0001 0.00007 0.0026 0.95903
## loading.time 1 0.0051 0.00512 0.1983 0.65636
## im_s_verylarge 1 0.0783 0.07834 3.0316 0.08241 .
## im_s_large 1 0.0236 0.02365 0.9150 0.33936
## im_s_medium 1 0.0009 0.00091 0.0351 0.85141
## im_s_small 1 0.0001 0.00009 0.0034 0.95357
## im_s_thumbnail 1 0.1688 0.16878 6.5311 0.01096 *
## Residuals 410 10.5955 0.02584
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Predictor matrix for glmnet: the design matrix of the full model without
# its intercept column.
MV_x <- model.matrix(MV_full)[, -1]
dim(MV_x)
## [1] 439 30
# Regularisation path with glmnet's default settings (alpha = 1, i.e. lasso).
MV_lasso <- glmnet(MV_x, total_500_final$Market_Value)
# Back to a single plotting panel. `no.readonly` only matters when par() is
# called without any setting, so it is not passed here.
par(mfrow = c(1, 1))
plot(MV_lasso, label = TRUE)

plot(MV_lasso, xvar = "lambda", label = TRUE)

# Cross-validated choice of lambda.
# NOTE(review): cv.glmnet assigns the folds at random, so lambda.min and
# lambda.1se can change between runs unless a seed is set beforehand.
MV_lassob <- cv.glmnet(MV_x, total_500_final$Market_Value)
MV_lassob$lambda.min
## [1] 0.02930923
MV_lassob$lambda.1se
## [1] 0.02930923
plot(MV_lassob)

# Coefficients at lambda.min (the lambda with the minimum cross-validated MSE)
# for Market_Value
MV_blasso <- coef(MV_lassob, s = "lambda.min")
MV_blasso
## 31 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 1.027335
## Ranking .
## Assets .
## Revenues .
## Total_SH_Equity .
## The_page_opened .
## non.document.error .
## number_of_errors .
## number_of_warning .
## external .
## internal .
## total.links .
## facebook .
## instagram .
## linkedin .
## pinterest .
## twitter .
## youtube .
## Readability .
## total_words .
## unique_words .
## .gif .
## .jpg .
## .png .
## total.images .
## loading.time .
## im_s_verylarge .
## im_s_large .
## im_s_medium .
## im_s_small .
## im_s_thumbnail .
dim(MV_blasso)
## [1] 31 1
# Scale every slope (intercept dropped) by the standard deviation of its
# predictor column, for the lasso fit and for the least-squares fit alike.
MV_zblasso <- MV_blasso[-1] * apply(MV_x, 2, sd)
MV_zbolt <- coef(MV_full)[-1] * apply(MV_x, 2, sd)
MV_azbolt <- abs(MV_zbolt)
sum(MV_azbolt)
## [1] NA
# The sum is NA, which means some coefficients of the full model are NA and
# the corresponding variables have to be removed.
# To find out which variables to remove we print the coefficients and look
# for the ones reported as NA.
coef(MV_full)
## (Intercept) Ranking Assets
## 1.501951e+00 6.235402e-02 -3.856147e-02
## Revenues Total_SH_Equity The_page_opened
## 4.164028e-02 9.894300e-02 NA
## non.document.error number_of_errors number_of_warning
## 2.778099e-02 4.892580e-05 4.895412e-04
## external internal total.links
## 2.911111e-06 -4.870999e-05 NA
## facebook instagram linkedin
## -3.091650e-02 7.483706e-03 1.360837e-02
## pinterest twitter youtube
## 2.315149e-03 -1.710294e-02 1.495867e-02
## Readability total_words unique_words
## 2.290723e-02 6.953619e-05 -1.312791e-04
## .gif .jpg .png
## -8.041172e-04 -1.429192e-04 -2.105077e-04
## total.images loading.time im_s_verylarge
## -2.087624e-05 -1.931367e-02 -5.208752e-02
## im_s_large im_s_medium im_s_small
## 6.884222e-02 5.037910e-03 1.053349e-02
## im_s_thumbnail
## -4.071058e-01
# Now we create a new model with only the variables whose coefficient is not NA.
# coef(MV_full) reports NA for The_page_opened and total.links, so these two
# columns are dropped. They are selected by name rather than by position
# (they are columns 6 and 12) so that the selection stays correct if the
# column order of total_500_final ever changes.
MV_total_500_final <- total_500_final[
  ,
  setdiff(names(total_500_final), c("The_page_opened", "total.links")),
  drop = FALSE
]
MV_full_2 <- lm(Market_Value ~ ., data = MV_total_500_final)
anova(MV_full_2)
## Analysis of Variance Table
##
## Response: Market_Value
## Df Sum Sq Mean Sq F value Pr(>F)
## Ranking 1 0.0540 0.05403 2.0908 0.14895
## Assets 1 0.0084 0.00844 0.3265 0.56803
## Revenues 1 0.0467 0.04674 1.8086 0.17942
## Total_SH_Equity 1 0.4033 0.40329 15.6054 9.186e-05 ***
## non.document.error 1 0.0357 0.03572 1.3821 0.24042
## number_of_errors 1 0.0240 0.02396 0.9273 0.33612
## number_of_warning 1 0.0280 0.02796 1.0821 0.29884
## external 1 0.0066 0.00659 0.2551 0.61378
## internal 1 0.0262 0.02623 1.0148 0.31434
## facebook 1 0.0545 0.05449 2.1084 0.14725
## instagram 1 0.0012 0.00120 0.0465 0.82936
## linkedin 1 0.0132 0.01316 0.5093 0.47584
## pinterest 1 0.0002 0.00017 0.0066 0.93544
## twitter 1 0.0101 0.01009 0.3903 0.53247
## youtube 1 0.0236 0.02361 0.9138 0.33968
## Readability 1 0.0002 0.00016 0.0062 0.93713
## total_words 1 0.0018 0.00183 0.0710 0.79003
## unique_words 1 0.0035 0.00353 0.1366 0.71185
## .gif 1 0.0268 0.02684 1.0384 0.30879
## .jpg 1 0.0120 0.01198 0.4636 0.49632
## .png 1 0.0195 0.01948 0.7540 0.38573
## total.images 1 0.0001 0.00007 0.0026 0.95903
## loading.time 1 0.0051 0.00512 0.1983 0.65636
## im_s_verylarge 1 0.0783 0.07834 3.0316 0.08241 .
## im_s_large 1 0.0236 0.02365 0.9150 0.33936
## im_s_medium 1 0.0009 0.00091 0.0351 0.85141
## im_s_small 1 0.0001 0.00009 0.0034 0.95357
## im_s_thumbnail 1 0.1688 0.16878 6.5311 0.01096 *
## Residuals 410 10.5955 0.02584
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Rebuild the predictor matrix from the reduced full model (intercept column
# removed) and repeat the glmnet analysis on it.
MV_x <- model.matrix(MV_full_2)[, -1]
dim(MV_x)
## [1] 439 28
# Regularisation path with glmnet's default settings (alpha = 1, i.e. lasso).
MV_lasso <- glmnet(MV_x, MV_total_500_final$Market_Value)
# Back to a single plotting panel. `no.readonly` only matters when par() is
# called without any setting, so it is not passed here.
par(mfrow = c(1, 1))
plot(MV_lasso, label = TRUE)

plot(MV_lasso, xvar = "lambda", label = TRUE)

# Cross-validated choice of lambda.
# NOTE(review): cv.glmnet assigns the folds at random, so lambda.min and
# lambda.1se can change between runs unless a seed is set beforehand.
MV_lassob <- cv.glmnet(MV_x, MV_total_500_final$Market_Value)
MV_lassob$lambda.min
## [1] 0.02930923
MV_lassob$lambda.1se
## [1] 0.02930923
plot(MV_lassob)

# Coefficients at lambda.min (the lambda with the minimum cross-validated MSE)
# for Market_Value
MV_blasso <- coef(MV_lassob, s = "lambda.min")
MV_blasso
## 29 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 1.027335
## Ranking .
## Assets .
## Revenues .
## Total_SH_Equity .
## non.document.error .
## number_of_errors .
## number_of_warning .
## external .
## internal .
## facebook .
## instagram .
## linkedin .
## pinterest .
## twitter .
## youtube .
## Readability .
## total_words .
## unique_words .
## .gif .
## .jpg .
## .png .
## total.images .
## loading.time .
## im_s_verylarge .
## im_s_large .
## im_s_medium .
## im_s_small .
## im_s_thumbnail .
dim(MV_blasso)
## [1] 29 1
# Slopes (intercept dropped) scaled by the standard deviation of their
# predictor column: lasso at lambda.min versus least squares on MV_full_2.
MV_zblasso <- MV_blasso[-1] * apply(MV_x, 2, sd)
MV_zbolt <- coef(MV_full_2)[-1] * apply(MV_x, 2, sd)
MV_azbolt <- abs(MV_zbolt)
sum(MV_azbolt)
## [1] 0.2506174
# Ratio of the summed absolute scaled lasso coefficients to the summed
# absolute scaled least-squares coefficients (MV_azbolt is already absolute).
MV_s <- sum(abs(MV_zblasso)) / sum(MV_azbolt)
MV_s
## [1] 0
# Same coefficient extraction, now at lambda.1se.
MV_blassob <- coef(MV_lassob, s = "lambda.1se")
MV_blassob
## 29 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 1.027335
## Ranking .
## Assets .
## Revenues .
## Total_SH_Equity .
## non.document.error .
## number_of_errors .
## number_of_warning .
## external .
## internal .
## facebook .
## instagram .
## linkedin .
## pinterest .
## twitter .
## youtube .
## Readability .
## total_words .
## unique_words .
## .gif .
## .jpg .
## .png .
## total.images .
## loading.time .
## im_s_verylarge .
## im_s_large .
## im_s_medium .
## im_s_small .
## im_s_thumbnail .
# Scaled coefficients and shrinkage ratio again, this time for lambda.1se.
# MV_s is overwritten with the new ratio.
MV_zblassob <- MV_blassob[-1] * apply(MV_x, 2, sd)
MV_zboltb <- coef(MV_full_2)[-1] * apply(MV_x, 2, sd)
MV_s <- sum(abs(MV_zblassob)) / sum(abs(MV_zboltb))
MV_s
## [1] 0
# Forward stepwise selection between the null model and the reduced full
# model, to see how many variables are actually worth adding.
MV_model_a <- step(
  MV_model_null,
  scope = list(lower = MV_model_null, upper = MV_full_2),
  direction = "forward"
)
## Start: AIC=-1590.39
## Market_Value ~ 1
##
## Df Sum of Sq RSS AIC
## + Total_SH_Equity 1 0.37711 11.295 -1602.8
## + im_s_thumbnail 1 0.28284 11.389 -1599.2
## + im_s_large 1 0.15078 11.521 -1594.1
## + im_s_verylarge 1 0.11809 11.554 -1592.8
## + facebook 1 0.10793 11.564 -1592.5
## + twitter 1 0.08897 11.583 -1591.8
## + Revenues 1 0.08159 11.590 -1591.5
## + im_s_medium 1 0.07961 11.592 -1591.4
## + im_s_small 1 0.06449 11.607 -1590.8
## + Ranking 1 0.05403 11.618 -1590.4
## <none> 11.672 -1590.4
## + .png 1 0.04474 11.627 -1590.1
## + unique_words 1 0.03835 11.634 -1589.8
## + non.document.error 1 0.03519 11.637 -1589.7
## + total_words 1 0.03297 11.639 -1589.6
## + number_of_warning 1 0.02813 11.644 -1589.5
## + .jpg 1 0.02661 11.645 -1589.4
## + internal 1 0.02521 11.647 -1589.3
## + linkedin 1 0.02238 11.650 -1589.2
## + total.images 1 0.01680 11.655 -1589.0
## + number_of_errors 1 0.01373 11.658 -1588.9
## + .gif 1 0.01177 11.660 -1588.8
## + loading.time 1 0.01037 11.662 -1588.8
## + Readability 1 0.00843 11.664 -1588.7
## + external 1 0.00676 11.665 -1588.6
## + youtube 1 0.00485 11.667 -1588.6
## + Assets 1 0.00378 11.668 -1588.5
## + instagram 1 0.00329 11.669 -1588.5
## + pinterest 1 0.00058 11.671 -1588.4
##
## Step: AIC=-1602.81
## Market_Value ~ Total_SH_Equity
##
## Df Sum of Sq RSS AIC
## + im_s_thumbnail 1 0.301199 10.994 -1612.7
## + im_s_large 1 0.133903 11.161 -1606.0
## + im_s_verylarge 1 0.106967 11.188 -1605.0
## + Revenues 1 0.103940 11.191 -1604.9
## + im_s_medium 1 0.083775 11.211 -1604.1
## + facebook 1 0.072114 11.223 -1603.6
## + im_s_small 1 0.062508 11.232 -1603.2
## + Ranking 1 0.053733 11.241 -1602.9
## <none> 11.295 -1602.8
## + twitter 1 0.050742 11.244 -1602.8
## + number_of_warning 1 0.033756 11.261 -1602.1
## + .png 1 0.030635 11.264 -1602.0
## + non.document.error 1 0.026540 11.268 -1601.8
## + number_of_errors 1 0.023128 11.272 -1601.7
## + internal 1 0.020778 11.274 -1601.6
## + unique_words 1 0.020330 11.274 -1601.6
## + total_words 1 0.016564 11.278 -1601.5
## + .jpg 1 0.013735 11.281 -1601.3
## + .gif 1 0.013620 11.281 -1601.3
## + total.images 1 0.010776 11.284 -1601.2
## + Assets 1 0.007083 11.288 -1601.1
## + loading.time 1 0.006700 11.288 -1601.1
## + linkedin 1 0.006265 11.289 -1601.0
## + Readability 1 0.003535 11.291 -1600.9
## + external 1 0.002892 11.292 -1600.9
## + instagram 1 0.001239 11.294 -1600.8
## + youtube 1 0.000458 11.294 -1600.8
## + pinterest 1 0.000039 11.295 -1600.8
##
## Step: AIC=-1612.67
## Market_Value ~ Total_SH_Equity + im_s_thumbnail
##
## Df Sum of Sq RSS AIC
## + facebook 1 0.074041 10.920 -1613.6
## + loading.time 1 0.052752 10.941 -1612.8
## + internal 1 0.050184 10.944 -1612.7
## <none> 10.994 -1612.7
## + Revenues 1 0.037088 10.957 -1612.2
## + twitter 1 0.036788 10.957 -1612.1
## + non.document.error 1 0.036226 10.957 -1612.1
## + total_words 1 0.035328 10.958 -1612.1
## + .gif 1 0.032314 10.961 -1612.0
## + .png 1 0.031253 10.962 -1611.9
## + number_of_warning 1 0.030961 10.963 -1611.9
## + unique_words 1 0.023851 10.970 -1611.6
## + Readability 1 0.022685 10.971 -1611.6
## + total.images 1 0.022109 10.972 -1611.6
## + .jpg 1 0.016266 10.977 -1611.3
## + im_s_medium 1 0.011401 10.982 -1611.1
## + im_s_large 1 0.008765 10.985 -1611.0
## + im_s_verylarge 1 0.007266 10.986 -1611.0
## + Assets 1 0.006372 10.987 -1610.9
## + Ranking 1 0.006277 10.987 -1610.9
## + im_s_small 1 0.006274 10.987 -1610.9
## + external 1 0.003810 10.990 -1610.8
## + linkedin 1 0.003341 10.990 -1610.8
## + instagram 1 0.002386 10.991 -1610.8
## + pinterest 1 0.001821 10.992 -1610.7
## + youtube 1 0.000901 10.993 -1610.7
## + number_of_errors 1 0.000100 10.994 -1610.7
##
## Step: AIC=-1613.64
## Market_Value ~ Total_SH_Equity + im_s_thumbnail + facebook
##
## Df Sum of Sq RSS AIC
## <none> 10.920 -1613.6
## + loading.time 1 0.041941 10.878 -1613.3
## + number_of_warning 1 0.035441 10.884 -1613.1
## + Revenues 1 0.034703 10.885 -1613.0
## + non.document.error 1 0.028640 10.891 -1612.8
## + .gif 1 0.028568 10.891 -1612.8
## + .png 1 0.027321 10.892 -1612.7
## + internal 1 0.027315 10.892 -1612.7
## + youtube 1 0.022411 10.897 -1612.5
## + Readability 1 0.020159 10.899 -1612.5
## + total.images 1 0.016145 10.903 -1612.3
## + total_words 1 0.011390 10.908 -1612.1
## + .jpg 1 0.010887 10.909 -1612.1
## + im_s_medium 1 0.010316 10.909 -1612.0
## + linkedin 1 0.009650 10.910 -1612.0
## + im_s_large 1 0.009103 10.911 -1612.0
## + Assets 1 0.008412 10.911 -1612.0
## + unique_words 1 0.008150 10.912 -1612.0
## + im_s_verylarge 1 0.006549 10.913 -1611.9
## + im_s_small 1 0.005609 10.914 -1611.9
## + Ranking 1 0.005106 10.915 -1611.8
## + instagram 1 0.003125 10.916 -1611.8
## + external 1 0.000901 10.919 -1611.7
## + number_of_errors 1 0.000870 10.919 -1611.7
## + pinterest 1 0.000799 10.919 -1611.7
## + twitter 1 0.000043 10.920 -1611.6
# Print the coefficient table and overall fit statistics of MV_model_a,
# the model returned by the forward selection.
summary(MV_model_a)
##
## Call:
## lm(formula = Market_Value ~ Total_SH_Equity + im_s_thumbnail +
## facebook, data = total_500_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.32440 -0.03211 -0.00531 -0.00531 0.99469
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.57365 0.18503 8.505 2.95e-16 ***
## Total_SH_Equity 0.09663 0.02558 3.778 0.000180 ***
## im_s_thumbnail -0.31909 0.09182 -3.475 0.000562 ***
## facebook -0.02680 0.01560 -1.717 0.086614 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1584 on 435 degrees of freedom
## Multiple R-squared: 0.06446, Adjusted R-squared: 0.05801
## F-statistic: 9.99 on 3 and 435 DF, p-value: 2.221e-06
# Store and print the adjusted R-squared and the AIC of MV_model_a.
MV_ad_r_sq_ma <- summary(MV_model_a)$adj.r.squared
MV_ad_r_sq_ma
## [1] 0.05800616
MV_aic_ma <- AIC(MV_model_a)
MV_aic_ma
## [1] -365.8106
# Diagnostic plots 1-3 of plot.lm (residuals vs fitted, normal Q-Q,
# scale-location) stacked in a single column. The previous graphics layout is
# restored afterwards so later figures are not drawn into the 3x1 grid.
old_par <- par(mfrow = c(3, 1))
plot(MV_model_a, which = 1:3)
par(old_par)
# We compute the confidence intervals of the model coefficients
# (confint's default 95% level, i.e. the 2.5 % and 97.5 % bounds printed below)
confint(MV_model_a)
## 2.5 % 97.5 %
## (Intercept) 1.20998067 1.937322309
## Total_SH_Equity 0.04636533 0.146902783
## im_s_thumbnail -0.49956394 -0.138613799
## facebook -0.05745946 0.003869425
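#From this model we can conclude that for the Market Value the variables that play the most important role are
#whether or not they use thumbnail images, whether or not they have facebook and the total stockholder equity
#(note: facebook is only significant at the 10% level, p = 0.087, and its 95% confidence interval includes zero)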
######################################################################################################
#Next we will try to see which variables play the most important role for the Revenues of a company
# Intercept-only (null) linear model for Revenues; it is the starting point
# of the forward selection further down.
RV_model_null <- lm(Revenues ~ 1, data = total_500_final)
summary(RV_model_null)
##
## Call:
## lm(formula = Revenues ~ 1, data = total_500_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.1002 -0.1002 -0.1002 -0.1002 0.8998
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.10023 0.01435 76.68 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3006 on 438 degrees of freedom
# Full model: Revenues regressed on every other column of total_500_final,
# followed by its sequential analysis-of-variance table.
RV_full_model <- lm(Revenues ~ ., data = total_500_final)
anova(RV_full_model)
## Analysis of Variance Table
##
## Response: Revenues
## Df Sum Sq Mean Sq F value Pr(>F)
## Ranking 1 8.2846 8.2846 202.1335 < 2.2e-16 ***
## Assets 1 0.5825 0.5825 14.2118 0.0001873 ***
## Market_Value 1 0.1237 0.1237 3.0178 0.0831054 .
## Total_SH_Equity 1 0.2247 0.2247 5.4827 0.0196833 *
## non.document.error 1 0.0517 0.0517 1.2626 0.2618206
## number_of_errors 1 0.0013 0.0013 0.0324 0.8572534
## number_of_warning 1 0.1441 0.1441 3.5150 0.0615271 .
## external 1 0.0198 0.0198 0.4822 0.4878221
## internal 1 0.1535 0.1535 3.7461 0.0536188 .
## facebook 1 0.0104 0.0104 0.2540 0.6145417
## instagram 1 0.0182 0.0182 0.4449 0.5051348
## linkedin 1 0.1909 0.1909 4.6585 0.0314803 *
## pinterest 1 0.3777 0.3777 9.2148 0.0025538 **
## twitter 1 0.0028 0.0028 0.0676 0.7949710
## youtube 1 0.1255 0.1255 3.0631 0.0808387 .
## Readability 1 0.0211 0.0211 0.5153 0.4732417
## total_words 1 0.1310 0.1310 3.1952 0.0745920 .
## unique_words 1 0.0158 0.0158 0.3866 0.5344593
## .gif 1 0.0049 0.0049 0.1188 0.7305354
## .jpg 1 0.0436 0.0436 1.0648 0.3027239
## .png 1 0.0001 0.0001 0.0019 0.9653418
## total.images 1 0.7535 0.7535 18.3853 2.252e-05 ***
## loading.time 1 0.0026 0.0026 0.0642 0.8001070
## im_s_verylarge 1 0.0114 0.0114 0.2791 0.5975791
## im_s_large 1 0.0244 0.0244 0.5943 0.4412207
## im_s_medium 1 11.4280 11.4280 278.8276 < 2.2e-16 ***
## im_s_small 1 0.0083 0.0083 0.2026 0.6529015
## im_s_thumbnail 1 0.0295 0.0295 0.7197 0.3967292
## Residuals 410 16.8042 0.0410
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#####################################################################################################
#Use of LASSO
library(glmnet)
# First approach for Revenues: fit the full linear model on all columns.
# It supplies both the design matrix for glmnet and the least-squares
# coefficients that the lasso estimates are compared against.
RV_full <- lm(Revenues ~ ., data = total_500_final)
anova(RV_full)
## Analysis of Variance Table
##
## Response: Revenues
## Df Sum Sq Mean Sq F value Pr(>F)
## Ranking 1 8.2846 8.2846 202.1335 < 2.2e-16 ***
## Assets 1 0.5825 0.5825 14.2118 0.0001873 ***
## Market_Value 1 0.1237 0.1237 3.0178 0.0831054 .
## Total_SH_Equity 1 0.2247 0.2247 5.4827 0.0196833 *
## non.document.error 1 0.0517 0.0517 1.2626 0.2618206
## number_of_errors 1 0.0013 0.0013 0.0324 0.8572534
## number_of_warning 1 0.1441 0.1441 3.5150 0.0615271 .
## external 1 0.0198 0.0198 0.4822 0.4878221
## internal 1 0.1535 0.1535 3.7461 0.0536188 .
## facebook 1 0.0104 0.0104 0.2540 0.6145417
## instagram 1 0.0182 0.0182 0.4449 0.5051348
## linkedin 1 0.1909 0.1909 4.6585 0.0314803 *
## pinterest 1 0.3777 0.3777 9.2148 0.0025538 **
## twitter 1 0.0028 0.0028 0.0676 0.7949710
## youtube 1 0.1255 0.1255 3.0631 0.0808387 .
## Readability 1 0.0211 0.0211 0.5153 0.4732417
## total_words 1 0.1310 0.1310 3.1952 0.0745920 .
## unique_words 1 0.0158 0.0158 0.3866 0.5344593
## .gif 1 0.0049 0.0049 0.1188 0.7305354
## .jpg 1 0.0436 0.0436 1.0648 0.3027239
## .png 1 0.0001 0.0001 0.0019 0.9653418
## total.images 1 0.7535 0.7535 18.3853 2.252e-05 ***
## loading.time 1 0.0026 0.0026 0.0642 0.8001070
## im_s_verylarge 1 0.0114 0.0114 0.2791 0.5975791
## im_s_large 1 0.0244 0.0244 0.5943 0.4412207
## im_s_medium 1 11.4280 11.4280 278.8276 < 2.2e-16 ***
## im_s_small 1 0.0083 0.0083 0.2026 0.6529015
## im_s_thumbnail 1 0.0295 0.0295 0.7197 0.3967292
## Residuals 410 16.8042 0.0410
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Predictor matrix for glmnet: the design matrix of the full model without
# its intercept column.
RV_x <- model.matrix(RV_full)[, -1]
dim(RV_x)
## [1] 439 30
# Regularisation path with glmnet's default settings (alpha = 1, i.e. lasso).
RV_lasso <- glmnet(RV_x, total_500_final$Revenues)
# Back to a single plotting panel. `no.readonly` only matters when par() is
# called without any setting, so it is not passed here.
par(mfrow = c(1, 1))
plot(RV_lasso, label = TRUE)

plot(RV_lasso, xvar = "lambda", label = TRUE)

# Cross-validated choice of lambda.
# NOTE(review): cv.glmnet assigns the folds at random, so lambda.min and
# lambda.1se can change between runs unless a seed is set beforehand.
RV_lassob <- cv.glmnet(RV_x, total_500_final$Revenues)
RV_lassob$lambda.min
## [1] 0.0100434
RV_lassob$lambda.1se
## [1] 0.1027973
plot(RV_lassob)

# Coefficients at lambda.min (the lambda with the minimum cross-validated MSE)
# for Revenues
RV_blasso <- coef(RV_lassob, s = "lambda.min")
RV_blasso
## 31 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 2.566295e+00
## Ranking .
## Assets 2.869487e-01
## Market_Value 2.389096e-03
## Total_SH_Equity -1.998814e-02
## The_page_opened .
## non.document.error .
## number_of_errors .
## number_of_warning .
## external .
## internal -1.094212e-05
## total.links .
## facebook .
## instagram .
## linkedin .
## pinterest 1.589603e-02
## twitter .
## youtube .
## Readability .
## total_words .
## unique_words .
## .gif .
## .jpg -2.824092e-04
## .png .
## total.images 2.178153e-04
## loading.time .
## im_s_verylarge .
## im_s_large .
## im_s_medium -8.954334e-01
## im_s_small .
## im_s_thumbnail .
dim(RV_blasso)
## [1] 31 1
# Scale every slope (intercept dropped) by the standard deviation of its
# predictor column, for the lasso fit and for the least-squares fit alike.
RV_zblasso <- RV_blasso[-1] * apply(RV_x, 2, sd)
RV_zbolt <- coef(RV_full)[-1] * apply(RV_x, 2, sd)
RV_azbolt <- abs(RV_zbolt)
sum(RV_azbolt)
## [1] NA
# The sum is NA, which means some coefficients of the full model are NA and
# the corresponding variables have to be removed.
# To find out which variables to remove we print the coefficients and look
# for the ones reported as NA.
coef(RV_full)
## (Intercept) Ranking Assets
## 2.453478e+00 7.266410e-02 4.133751e-01
## Market_Value Total_SH_Equity The_page_opened
## 6.604040e-02 -6.336724e-02 NA
## non.document.error number_of_errors number_of_warning
## -2.621713e-02 -4.710816e-05 -4.425151e-04
## external internal total.links
## 2.499094e-04 -2.271160e-05 NA
## facebook instagram linkedin
## -1.371884e-02 1.230545e-02 -1.057591e-02
## pinterest twitter youtube
## 4.837637e-02 -1.636043e-02 3.570436e-02
## Readability total_words unique_words
## 2.940211e-02 -5.534889e-05 1.307116e-04
## .gif .jpg .png
## -9.776969e-04 -8.735707e-04 -7.121376e-04
## total.images loading.time im_s_verylarge
## 4.616179e-04 -9.597185e-03 9.754948e-03
## im_s_large im_s_medium im_s_small
## 2.233695e-01 -9.509966e-01 -1.091940e-01
## im_s_thumbnail
## -1.713943e-01
# Now we create a new model with only the variables whose coefficient is not NA.
# coef(RV_full) reports NA for The_page_opened and total.links, so these two
# columns are dropped. They are selected by name rather than by position
# (they are columns 6 and 12) so that the selection stays correct if the
# column order of total_500_final ever changes.
RV_total_500_final <- total_500_final[
  ,
  setdiff(names(total_500_final), c("The_page_opened", "total.links")),
  drop = FALSE
]
RV_full_2 <- lm(Revenues ~ ., data = RV_total_500_final)
anova(RV_full_2)
## Analysis of Variance Table
##
## Response: Revenues
## Df Sum Sq Mean Sq F value Pr(>F)
## Ranking 1 8.2846 8.2846 202.1335 < 2.2e-16 ***
## Assets 1 0.5825 0.5825 14.2118 0.0001873 ***
## Market_Value 1 0.1237 0.1237 3.0178 0.0831054 .
## Total_SH_Equity 1 0.2247 0.2247 5.4827 0.0196833 *
## non.document.error 1 0.0517 0.0517 1.2626 0.2618206
## number_of_errors 1 0.0013 0.0013 0.0324 0.8572534
## number_of_warning 1 0.1441 0.1441 3.5150 0.0615271 .
## external 1 0.0198 0.0198 0.4822 0.4878221
## internal 1 0.1535 0.1535 3.7461 0.0536188 .
## facebook 1 0.0104 0.0104 0.2540 0.6145417
## instagram 1 0.0182 0.0182 0.4449 0.5051348
## linkedin 1 0.1909 0.1909 4.6585 0.0314803 *
## pinterest 1 0.3777 0.3777 9.2148 0.0025538 **
## twitter 1 0.0028 0.0028 0.0676 0.7949710
## youtube 1 0.1255 0.1255 3.0631 0.0808387 .
## Readability 1 0.0211 0.0211 0.5153 0.4732417
## total_words 1 0.1310 0.1310 3.1952 0.0745920 .
## unique_words 1 0.0158 0.0158 0.3866 0.5344593
## .gif 1 0.0049 0.0049 0.1188 0.7305354
## .jpg 1 0.0436 0.0436 1.0648 0.3027239
## .png 1 0.0001 0.0001 0.0019 0.9653418
## total.images 1 0.7535 0.7535 18.3853 2.252e-05 ***
## loading.time 1 0.0026 0.0026 0.0642 0.8001070
## im_s_verylarge 1 0.0114 0.0114 0.2791 0.5975791
## im_s_large 1 0.0244 0.0244 0.5943 0.4412207
## im_s_medium 1 11.4280 11.4280 278.8276 < 2.2e-16 ***
## im_s_small 1 0.0083 0.0083 0.2026 0.6529015
## im_s_thumbnail 1 0.0295 0.0295 0.7197 0.3967292
## Residuals 410 16.8042 0.0410
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Rebuild the predictor matrix from the reduced full model (intercept column
# removed) and repeat the glmnet analysis on it.
RV_x <- model.matrix(RV_full_2)[, -1]
dim(RV_x)
## [1] 439 28
# Regularisation path with glmnet's default settings (alpha = 1, i.e. lasso).
RV_lasso <- glmnet(RV_x, RV_total_500_final$Revenues)
# Back to a single plotting panel. `no.readonly` only matters when par() is
# called without any setting, so it is not passed here.
par(mfrow = c(1, 1))
plot(RV_lasso, label = TRUE)

plot(RV_lasso, xvar = "lambda", label = TRUE)

# Cross-validated choice of lambda.
# NOTE(review): cv.glmnet assigns the folds at random, so lambda.min and
# lambda.1se can change between runs unless a seed is set beforehand.
RV_lassob <- cv.glmnet(RV_x, RV_total_500_final$Revenues)
RV_lassob$lambda.min
## [1] 0.007597465
RV_lassob$lambda.1se
## [1] 0.07776241
plot(RV_lassob)

# Coefficients at lambda.min (the lambda with the minimum cross-validated MSE)
# for Revenues
RV_blasso <- coef(RV_lassob, s = "lambda.min")
RV_blasso
## 29 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 2.554560e+00
## Ranking .
## Assets 3.105024e-01
## Market_Value 1.933201e-02
## Total_SH_Equity -3.018886e-02
## non.document.error -5.296986e-03
## number_of_errors .
## number_of_warning .
## external .
## internal -3.302054e-05
## facebook .
## instagram .
## linkedin .
## pinterest 2.094632e-02
## twitter .
## youtube 4.567198e-03
## Readability .
## total_words .
## unique_words .
## .gif .
## .jpg -4.016941e-04
## .png -9.896474e-05
## total.images 2.597004e-04
## loading.time -1.674972e-04
## im_s_verylarge .
## im_s_large .
## im_s_medium -9.030456e-01
## im_s_small .
## im_s_thumbnail .
dim(RV_blasso)
## [1] 29 1
# Slopes (intercept dropped) scaled by the standard deviation of their
# predictor column: lasso at lambda.min versus least squares on RV_full_2.
RV_zblasso <- RV_blasso[-1] * apply(RV_x, 2, sd)
RV_zbolt <- coef(RV_full_2)[-1] * apply(RV_x, 2, sd)
RV_azbolt <- abs(RV_zbolt)
sum(RV_azbolt)
## [1] 0.6096725
# Ratio of the summed absolute scaled lasso coefficients to the summed
# absolute scaled least-squares coefficients (RV_azbolt is already absolute).
RV_s <- sum(abs(RV_zblasso)) / sum(RV_azbolt)
RV_s
## [1] 0.5182943
# Same coefficient extraction, now at lambda.1se.
RV_blassob <- coef(RV_lassob, s = "lambda.1se")
RV_blassob
## 29 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 2.2863841
## Ranking .
## Assets .
## Market_Value .
## Total_SH_Equity .
## non.document.error .
## number_of_errors .
## number_of_warning .
## external .
## internal .
## facebook .
## instagram .
## linkedin .
## pinterest .
## twitter .
## youtube .
## Readability .
## total_words .
## unique_words .
## .gif .
## .jpg .
## .png .
## total.images .
## loading.time .
## im_s_verylarge .
## im_s_large .
## im_s_medium -0.6097454
## im_s_small .
## im_s_thumbnail .
# Scaled coefficients and shrinkage ratio again, this time for lambda.1se.
# RV_s is overwritten with the new ratio.
RV_zblassob <- RV_blassob[-1] * apply(RV_x, 2, sd)
RV_zboltb <- coef(RV_full_2)[-1] * apply(RV_x, 2, sd)
RV_s <- sum(abs(RV_zblassob)) / sum(abs(RV_zboltb))
RV_s
## [1] 0.227621
#Forward selection: start from the null model and add terms one at a time, up to the
#terms of the full model, to see how many variables are indeed important
RV_model_a <- step(
  RV_model_null,
  scope = list(lower = RV_model_null, upper = RV_full_2),
  direction = "forward"
)
## Start: AIC=-1054.2
## Revenues ~ 1
##
## Df Sum of Sq RSS AIC
## + im_s_medium 1 20.5538 19.036 -1373.7
## + Ranking 1 8.2846 31.305 -1155.3
## + im_s_small 1 7.4388 32.151 -1143.6
## + im_s_verylarge 1 4.9249 34.665 -1110.5
## + im_s_large 1 4.0946 35.495 -1100.1
## + im_s_thumbnail 1 2.4455 37.144 -1080.2
## + Assets 1 1.2632 38.327 -1066.4
## + pinterest 1 0.6042 38.986 -1059.0
## + total.images 1 0.5870 39.003 -1058.8
## + number_of_warning 1 0.4572 39.133 -1057.3
## + number_of_errors 1 0.3583 39.232 -1056.2
## + non.document.error 1 0.2799 39.310 -1055.3
## + Market_Value 1 0.2767 39.313 -1055.3
## <none> 39.590 -1054.2
## + external 1 0.1519 39.438 -1053.9
## + linkedin 1 0.1511 39.439 -1053.9
## + Total_SH_Equity 1 0.1375 39.452 -1053.7
## + unique_words 1 0.0715 39.518 -1053.0
## + youtube 1 0.0508 39.539 -1052.8
## + instagram 1 0.0490 39.541 -1052.7
## + twitter 1 0.0478 39.542 -1052.7
## + .gif 1 0.0439 39.546 -1052.7
## + internal 1 0.0185 39.571 -1052.4
## + .jpg 1 0.0122 39.578 -1052.3
## + loading.time 1 0.0106 39.579 -1052.3
## + facebook 1 0.0089 39.581 -1052.3
## + total_words 1 0.0011 39.589 -1052.2
## + Readability 1 0.0010 39.589 -1052.2
## + .png 1 0.0000 39.590 -1052.2
##
## Step: AIC=-1373.65
## Revenues ~ im_s_medium
##
## Df Sum of Sq RSS AIC
## + Assets 1 0.66387 18.372 -1387.2
## + total.images 1 0.52364 18.512 -1383.9
## + pinterest 1 0.13050 18.906 -1374.7
## + external 1 0.12193 18.914 -1374.5
## + Total_SH_Equity 1 0.10067 18.936 -1374.0
## <none> 19.036 -1373.7
## + non.document.error 1 0.07474 18.961 -1373.4
## + youtube 1 0.07101 18.965 -1373.3
## + instagram 1 0.04772 18.988 -1372.8
## + .gif 1 0.04186 18.994 -1372.6
## + loading.time 1 0.03458 19.002 -1372.5
## + internal 1 0.03261 19.003 -1372.4
## + .jpg 1 0.02319 19.013 -1372.2
## + Market_Value 1 0.02315 19.013 -1372.2
## + Readability 1 0.02167 19.015 -1372.2
## + linkedin 1 0.01085 19.025 -1371.9
## + .png 1 0.00791 19.028 -1371.8
## + number_of_warning 1 0.00581 19.030 -1371.8
## + total_words 1 0.00537 19.031 -1371.8
## + number_of_errors 1 0.00508 19.031 -1371.8
## + twitter 1 0.00125 19.035 -1371.7
## + facebook 1 0.00080 19.035 -1371.7
## + unique_words 1 0.00027 19.036 -1371.7
## + Ranking 1 0.00000 19.036 -1371.7
## + im_s_verylarge 1 0.00000 19.036 -1371.7
## + im_s_large 1 0.00000 19.036 -1371.7
## + im_s_small 1 0.00000 19.036 -1371.7
## + im_s_thumbnail 1 0.00000 19.036 -1371.7
##
## Step: AIC=-1387.24
## Revenues ~ im_s_medium + Assets
##
## Df Sum of Sq RSS AIC
## + total.images 1 0.51970 17.853 -1397.8
## + pinterest 1 0.15634 18.216 -1389.0
## + Total_SH_Equity 1 0.12131 18.251 -1388.1
## + external 1 0.11395 18.258 -1388.0
## <none> 18.372 -1387.2
## + youtube 1 0.06707 18.305 -1386.8
## + non.document.error 1 0.04993 18.322 -1386.4
## + instagram 1 0.04843 18.324 -1386.4
## + .gif 1 0.04620 18.326 -1386.3
## + .jpg 1 0.03919 18.333 -1386.2
## + loading.time 1 0.03908 18.333 -1386.2
## + Market_Value 1 0.02945 18.343 -1385.9
## + internal 1 0.02791 18.344 -1385.9
## + im_s_large 1 0.02208 18.350 -1385.8
## + Readability 1 0.01736 18.355 -1385.7
## + im_s_verylarge 1 0.01731 18.355 -1385.7
## + linkedin 1 0.01372 18.359 -1385.6
## + .png 1 0.00968 18.363 -1385.5
## + im_s_small 1 0.00950 18.363 -1385.5
## + Ranking 1 0.00797 18.364 -1385.4
## + number_of_warning 1 0.00459 18.368 -1385.3
## + total_words 1 0.00421 18.368 -1385.3
## + number_of_errors 1 0.00344 18.369 -1385.3
## + im_s_thumbnail 1 0.00080 18.372 -1385.2
## + twitter 1 0.00067 18.372 -1385.2
## + facebook 1 0.00008 18.372 -1385.2
## + unique_words 1 0.00000 18.372 -1385.2
##
## Step: AIC=-1397.83
## Revenues ~ im_s_medium + Assets + total.images
##
## Df Sum of Sq RSS AIC
## + .jpg 1 0.267578 17.585 -1402.5
## + .png 1 0.113665 17.739 -1398.6
## + Total_SH_Equity 1 0.101196 17.751 -1398.3
## + total_words 1 0.085031 17.767 -1397.9
## + internal 1 0.081181 17.771 -1397.8
## <none> 17.853 -1397.8
## + pinterest 1 0.064665 17.788 -1397.4
## + loading.time 1 0.059938 17.793 -1397.3
## + number_of_errors 1 0.043880 17.809 -1396.9
## + Market_Value 1 0.039884 17.813 -1396.8
## + youtube 1 0.036512 17.816 -1396.7
## + im_s_large 1 0.034563 17.818 -1396.7
## + unique_words 1 0.033990 17.819 -1396.7
## + non.document.error 1 0.029506 17.823 -1396.6
## + im_s_verylarge 1 0.025471 17.827 -1396.5
## + im_s_small 1 0.018713 17.834 -1396.3
## + Ranking 1 0.016398 17.836 -1396.2
## + .gif 1 0.013996 17.839 -1396.2
## + Readability 1 0.013929 17.839 -1396.2
## + number_of_warning 1 0.012135 17.840 -1396.1
## + instagram 1 0.008221 17.844 -1396.0
## + twitter 1 0.007388 17.845 -1396.0
## + linkedin 1 0.006936 17.846 -1396.0
## + facebook 1 0.002768 17.850 -1395.9
## + external 1 0.001565 17.851 -1395.9
## + im_s_thumbnail 1 0.001046 17.852 -1395.9
##
## Step: AIC=-1402.46
## Revenues ~ im_s_medium + Assets + total.images + .jpg
##
## Df Sum of Sq RSS AIC
## + Total_SH_Equity 1 0.124782 17.460 -1403.6
## + .png 1 0.105030 17.480 -1403.1
## <none> 17.585 -1402.5
## + pinterest 1 0.066844 17.518 -1402.1
## + youtube 1 0.062062 17.523 -1402.0
## + internal 1 0.056702 17.528 -1401.9
## + loading.time 1 0.044415 17.541 -1401.6
## + im_s_large 1 0.042615 17.542 -1401.5
## + .gif 1 0.041540 17.544 -1401.5
## + non.document.error 1 0.033883 17.551 -1401.3
## + im_s_verylarge 1 0.033592 17.551 -1401.3
## + Market_Value 1 0.032984 17.552 -1401.3
## + external 1 0.020914 17.564 -1401.0
## + instagram 1 0.020360 17.565 -1401.0
## + im_s_small 1 0.019019 17.566 -1400.9
## + Ranking 1 0.017194 17.568 -1400.9
## + number_of_errors 1 0.015221 17.570 -1400.8
## + total_words 1 0.014224 17.571 -1400.8
## + Readability 1 0.013437 17.572 -1400.8
## + number_of_warning 1 0.010643 17.574 -1400.7
## + linkedin 1 0.003426 17.582 -1400.5
## + twitter 1 0.001606 17.583 -1400.5
## + im_s_thumbnail 1 0.001517 17.584 -1400.5
## + unique_words 1 0.000521 17.584 -1400.5
## + facebook 1 0.000285 17.585 -1400.5
##
## Step: AIC=-1403.59
## Revenues ~ im_s_medium + Assets + total.images + .jpg + Total_SH_Equity
##
## Df Sum of Sq RSS AIC
## + .png 1 0.115197 17.345 -1404.5
## <none> 17.460 -1403.6
## + pinterest 1 0.063646 17.397 -1403.2
## + Market_Value 1 0.061988 17.398 -1403.2
## + internal 1 0.058508 17.402 -1403.1
## + youtube 1 0.050771 17.409 -1402.9
## + loading.time 1 0.048329 17.412 -1402.8
## + .gif 1 0.037867 17.422 -1402.5
## + im_s_large 1 0.036936 17.423 -1402.5
## + im_s_verylarge 1 0.029722 17.430 -1402.3
## + non.document.error 1 0.029181 17.431 -1402.3
## + total_words 1 0.019529 17.441 -1402.1
## + external 1 0.018974 17.441 -1402.1
## + number_of_errors 1 0.018594 17.442 -1402.1
## + instagram 1 0.018162 17.442 -1402.0
## + Readability 1 0.018138 17.442 -1402.0
## + im_s_small 1 0.017613 17.443 -1402.0
## + Ranking 1 0.016378 17.444 -1402.0
## + number_of_warning 1 0.012212 17.448 -1401.9
## + linkedin 1 0.010088 17.450 -1401.8
## + twitter 1 0.006521 17.454 -1401.8
## + facebook 1 0.002428 17.458 -1401.7
## + im_s_thumbnail 1 0.002149 17.458 -1401.6
## + unique_words 1 0.002078 17.458 -1401.6
##
## Step: AIC=-1404.49
## Revenues ~ im_s_medium + Assets + total.images + .jpg + Total_SH_Equity +
## .png
##
## Df Sum of Sq RSS AIC
## <none> 17.345 -1404.5
## + pinterest 1 0.055800 17.289 -1403.9
## + Market_Value 1 0.055573 17.289 -1403.9
## + internal 1 0.052856 17.292 -1403.8
## + youtube 1 0.051667 17.293 -1403.8
## + loading.time 1 0.046308 17.299 -1403.7
## + .gif 1 0.043555 17.302 -1403.6
## + im_s_large 1 0.041585 17.303 -1403.5
## + non.document.error 1 0.035511 17.309 -1403.4
## + im_s_verylarge 1 0.034506 17.311 -1403.4
## + external 1 0.023878 17.321 -1403.1
## + instagram 1 0.020197 17.325 -1403.0
## + Readability 1 0.019582 17.325 -1403.0
## + number_of_errors 1 0.016573 17.328 -1402.9
## + im_s_small 1 0.015931 17.329 -1402.9
## + number_of_warning 1 0.013507 17.331 -1402.8
## + Ranking 1 0.013283 17.332 -1402.8
## + linkedin 1 0.006933 17.338 -1402.7
## + twitter 1 0.006059 17.339 -1402.7
## + total_words 1 0.005085 17.340 -1402.6
## + im_s_thumbnail 1 0.003549 17.341 -1402.6
## + facebook 1 0.002527 17.343 -1402.6
## + unique_words 1 0.000460 17.345 -1402.5
#Summary of the forward-selected revenue model
summary(RV_model_a)
##
## Call:
## lm(formula = Revenues ~ im_s_medium + Assets + total.images +
## .jpg + Total_SH_Equity + .png, data = total_500_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.43436 -0.05279 -0.04909 -0.02950 0.98250
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.601e+00 1.311e-01 19.837 < 2e-16 ***
## im_s_medium -9.357e-01 4.219e-02 -22.176 < 2e-16 ***
## Assets 3.843e-01 9.059e-02 4.243 2.70e-05 ***
## total.images 4.331e-04 9.482e-05 4.568 6.43e-06 ***
## .jpg -7.792e-04 2.936e-04 -2.654 0.00825 **
## Total_SH_Equity -5.927e-02 3.233e-02 -1.833 0.06744 .
## .png -7.074e-04 4.176e-04 -1.694 0.09101 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2004 on 432 degrees of freedom
## Multiple R-squared: 0.5619, Adjusted R-squared: 0.5558
## F-statistic: 92.34 on 6 and 432 DF, p-value: < 2.2e-16
#Keep the adjusted R-squared of the selected model for later comparison
RV_ad_r_sq_ma <- summary(RV_model_a)[["adj.r.squared"]]
RV_ad_r_sq_ma
## [1] 0.5557985
#AIC() is on a different scale from the AIC column printed by step(), which drops an
#additive constant, so the two numbers are not directly comparable
RV_aic_ma <- AIC(RV_model_a)
RV_aic_ma
## [1] -156.6661
#Draw the first three diagnostic plots of the selected model stacked in one column.
#The bare statements 1, 2 and 3 that used to follow the plot call did nothing except
#print themselves (presumably leftover list numbering), so they have been removed.
par(mfrow = c(3, 1))
plot(RV_model_a, which = 1:3)
#We compute the 95% confidence intervals for the coefficients of the model
confint(RV_model_a, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 2.3430304909 2.8583932507
## im_s_medium -1.0185934028 -0.8527375080
## Assets 0.2062814632 0.5623725332
## total.images 0.0002467702 0.0006194894
## .jpg -0.0013561803 -0.0002021286
## Total_SH_Equity -0.1228068417 0.0042720029
## .png -0.0015281930 0.0001134334
#From this model we can conclude that, for the Revenue, the variables that play the most important role are
#whether or not the site uses medium images, the assets, the total images, jpg, total stockholder equity and png
#############################################################################################################
#Now that we have a first glimpse of the important variables, we check again the variables that can affect
#the Ranking. We begin by keeping only the ones that were deemed important in the previous models, without
#any other information from the Fortune 500, just to see the site variables
#names(total_500_final)
#Keep Ranking plus the ten site variables used in the follow-up regression.
#The columns are selected by name instead of by position (formerly 1, 9, 10, 13, 16, 23,
#24, 25, 29, 30, 31), so a change in column order upstream cannot silently pick
#different variables; a missing name raises an "undefined columns" error instead.
total_500_final_after_reg <- total_500_final[, c(
  "Ranking", "number_of_warning", "external", "facebook", "pinterest",
  ".jpg", ".png", "total.images", "im_s_medium", "im_s_small",
  "im_s_thumbnail"
)]
#Back to a single-panel layout before drawing the correlation matrix
par(mfrow = c(1, 1))
corrplot(cor(total_500_final_after_reg), method = "number")

#Regress Ranking on every other column of the reduced data set and print the fit
model_after <- lm(formula = Ranking ~ ., data = total_500_final_after_reg)
summary(model_after)
##
## Call:
## lm(formula = Ranking ~ ., data = total_500_final_after_reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.88190 -0.00384 -0.00254 0.00048 0.08811
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.523e-03 5.289e-02 0.086 0.931887
## number_of_warning 1.973e-04 1.023e-04 1.929 0.054385 .
## external -2.539e-04 6.578e-05 -3.859 0.000131 ***
## facebook -4.560e-05 4.648e-03 -0.010 0.992176
## pinterest -2.253e-02 7.890e-03 -2.855 0.004508 **
## .jpg 6.834e-05 6.770e-05 1.010 0.313288
## .png -9.145e-05 9.488e-05 -0.964 0.335662
## total.images 4.018e-05 2.282e-05 1.761 0.078934 .
## im_s_medium 6.525e-02 1.216e-02 5.368 1.31e-07 ***
## im_s_small 9.413e-01 2.209e-02 42.610 < 2e-16 ***
## im_s_thumbnail -7.496e-03 3.233e-02 -0.232 0.816777
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04539 on 428 degrees of freedom
## Multiple R-squared: 0.9098, Adjusted R-squared: 0.9076
## F-statistic: 431.5 on 10 and 428 DF, p-value: < 2.2e-16
#From this model we can infer that the most important variables for a site ranking well in the Fortune 500
#are the number of external links,
#whether or not the site has Pinterest,
#whether it uses medium and small images,
#and also the number of warnings and the total number of images that the site has